From 8b9f912f00cf7688bdfb6e1aee2c6b766670e18a Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com>
Date: Mon, 4 May 2026 17:01:20 -0400
Subject: [PATCH 01/14] feat: add job workload to CRUD module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add create_job() to NodePoolCRUD that deploys K8s Jobs onto node pools.
Unlike deployments/statefulsets which run indefinitely, Jobs are
run-to-completion workloads — success means the pod terminated cleanly
(succeeded > 0), failure raises immediately (no self-healing).

- Add 'jobs' subcommand to handle_workload_operations() in main.py
  with --number-of-jobs and --completions args
- Add job.yml workload template with configurable completions and
  node affinity via label_selector
- Add _check_job_condition and _is_job_condition_met to
  kubernetes_client.py — checks completion_time + succeeded count
- Add wait_for_job_completed with 5-min timeout and 30s polling
- Job kind support in apply/update/delete manifest methods
---
 modules/python/clients/kubernetes_client.py   |  58 ++++++++++
 modules/python/crud/azure/node_pool_crud.py   | 108 ++++++++++++++++++
 modules/python/crud/main.py                   |  41 +++++++
 .../python/crud/workload_templates/job.yml    |  20 ++++
 4 files changed, 227 insertions(+)
 create mode 100644 modules/python/crud/workload_templates/job.yml

diff --git a/modules/python/clients/kubernetes_client.py b/modules/python/clients/kubernetes_client.py
index 60838d2ce4..9251351f5e 100644
--- a/modules/python/clients/kubernetes_client.py
+++ b/modules/python/clients/kubernetes_client.py
@@ -975,6 +975,8 @@ def wait_for_condition(self, resource_type: str, wait_condition_type: str, names
         valid_conditions = {
             'deployment': ['available', 'progressing', 'replicafailure', 'ready'],
             'deployments': ['available', 'progressing', 'replicafailure', 'ready'],
+            'job': ['complete', 'failed'],
+            'jobs': ['complete', 'failed'],
             # Add more resource types as needed
         }
 
@@ -1048,6 +1050,9 @@ def _check_resource_condition(self, resource_type: str, resource_name: str, cond
 
             if resource_type_lower in ['deployment', 'deployments']:
                 return self._check_deployment_condition(resource_name, condition_type, namespace, wait_all)
+            
+            if resource_type_lower in ['job', 'jobs']:
+                return self._check_job_condition(resource_name, condition_type, namespace, wait_all)
 
             logger.warning(f"Unsupported resource type for condition checking: {resource_type}")
             return False
@@ -1097,6 +1102,47 @@ def _is_deployment_condition_met(self, deployment, condition_type: str) -> bool:
 
         return False
 
+    def _check_job_condition(self, resource_name: str, condition_type: str, namespace: str, wait_all: bool) -> bool:
+        """Check job condition ('complete' or 'failed')."""
+        try:
+            if wait_all or not resource_name:
+                jobs = self.batch.list_namespaced_job(namespace=namespace).items
+            else:
+                job = self.batch.read_namespaced_job(name=resource_name, namespace=namespace)
+                jobs = [job]
+
+            for job in jobs:
+                if not self._is_job_condition_met(job, condition_type):
+                    return False
+
+            return True
+
+        except client.rest.ApiException as e:
+            if e.status == 404:
+                logger.debug("Job not found, waiting...")
+                return False
+            raise e
+
+    def _is_job_condition_met(self, job, condition_type: str) -> bool:
+        """Check if a job meets the specified condition."""
+        if not job.status:
+            return False
+
+        condition_type_lower = condition_type.lower()
+
+        if condition_type_lower == 'complete':
+            return bool(job.status.completion_time and job.status.succeeded and job.status.succeeded > 0)
+
+        if condition_type_lower == 'failed':
+            return bool(job.status.failed and job.status.failed > 0 and not job.status.active)
+
+        if job.status.conditions:
+            for condition in job.status.conditions:
+                if condition.type.lower() == condition_type_lower and condition.status == "True":
+                    return True
+
+        return False
+
     def _expand_and_validate_manifests(self, manifests):
         """
         Validate and expand manifests, handling List kind and non-dict manifests.
@@ -1159,6 +1205,8 @@ def _apply_single_manifest(self, manifest, namespace=None):
                 self.app.create_namespaced_daemon_set(namespace=namespace, body=manifest)
             elif kind == "StatefulSet":
                 self.app.create_namespaced_stateful_set(namespace=namespace, body=manifest)
+            elif kind == "Job":
+                self.batch.create_namespaced_job(namespace=namespace, body=manifest)
             elif kind == "Service":
                 self.api.create_namespaced_service(namespace=namespace, body=manifest)
             elif kind == "ConfigMap":
@@ -1313,6 +1361,11 @@ def _update_single_manifest(self, manifest, namespace=None):
                     self.app.patch_namespaced_stateful_set(name=name, namespace=namespace, body=manifest)
                 else:
                     raise ValueError("StatefulSet requires a namespace")
+            elif kind == "Job":
+                if namespace:
+                    self.batch.patch_namespaced_job(name=name, namespace=namespace, body=manifest)
+                else:
+                    raise ValueError("Job requires a namespace")
             elif kind == "Service":
                 if namespace:
                     self.api.patch_namespaced_service(name=name, namespace=namespace, body=manifest)
@@ -1494,6 +1547,11 @@ def _delete_single_manifest(self, manifest, ignore_not_found: bool = True, names
                     self.app.delete_namespaced_stateful_set(name=resource_name, namespace=namespace, body=delete_options)
                 else:
                     raise ValueError("StatefulSet requires a namespace")
+            elif kind == "Job":
+                if namespace:
+                    self.batch.delete_namespaced_job(name=resource_name, namespace=namespace, body=delete_options)
+                else:
+                    raise ValueError("Job requires a namespace")
             elif kind == "Service":
                 if namespace:
                     self.api.delete_namespaced_service(name=resource_name, namespace=namespace, body=delete_options)
diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py
index 208589d3d7..ff3321a21b 100644
--- a/modules/python/crud/azure/node_pool_crud.py
+++ b/modules/python/crud/azure/node_pool_crud.py
@@ -409,3 +409,111 @@ def create_deployment(
             return True
         logger.warning("Created %d/%d deployment(s)", successful_deployments, number_of_deployments)
         return False
+
+    def create_job(
+        self,
+        node_pool_name,
+        completions=1,
+        manifest_dir=None,
+        number_of_jobs=1,
+        label_selector="app=nginx-container",
+        namespace="default"
+    ):
+        """
+        Create Kubernetes jobs after node pool operations.
+
+        Args:
+            node_pool_name: Name of the node pool to target
+            job_name: Base name for the jobs
+            namespace: Kubernetes namespace (default: "default")
+            completions: Number of job completions (default: 1)
+            manifest_dir: Directory containing Kubernetes manifest files
+            number_of_jobs: Number of jobs to create (default: 1)
+
+        Returns:
+            True if all job creations were successful, False otherwise
+        """
+        logger.info("Creating %d job(s)", number_of_jobs)
+        logger.info("Target node pool: %s", node_pool_name)
+        logger.info("Job completions: %d", completions)
+        logger.info("Using manifest directory: %s", manifest_dir)
+
+        try:
+            # Get Kubernetes client from AKS client
+            k8s_client = self.aks_client.k8s_client
+
+            if not k8s_client:
+                logger.error("Kubernetes client not available")
+                return False
+
+            successful_jobs = 0
+
+            # Loop through number of jobs
+            for job_index in range(1, number_of_jobs + 1):
+                logger.info("Creating job %d/%d", job_index, number_of_jobs)
+
+                try:
+                    if manifest_dir:
+                        # Use the template path from manifest_dir
+                        template_path = f"{manifest_dir}/job.yml"
+                    else:
+                        # Use default template path
+                        template_path = "modules/python/crud/workload_templates/job.yml"
+
+                    # Generate job name
+                    job_name = f"myapp-{node_pool_name}-{job_index}"
+
+                    # Create job template using k8s_client.create_template
+                    job_template = k8s_client.create_template(
+                        template_path,
+                        {
+                            "JOB_COMPLETIONS": completions,
+                            "NODE_POOL_NAME": node_pool_name,
+                            "INDEX": job_index,
+                            "LABEL_VALUE": label_selector.split("=", 1)[-1],
+                        }
+                    )
+
+                    # Apply each document in the rendered multi-doc template
+                    for doc in yaml.safe_load_all(job_template):
+                        if doc:
+                            k8s_client.apply_manifest_from_file(manifest_dict=doc)
+
+                    logger.info("Applied manifest for job %s", job_name)
+
+                    # Wait for job to complete (successful job verification)
+                    logger.info("Waiting for job %s to complete...", job_name)
+                    job_ready = k8s_client.wait_for_condition(
+                        resource_type="job",
+                        wait_condition_type="complete",
+                        resource_name=job_name,
+                        namespace=namespace,
+                        timeout_seconds=self.step_timeout
+                    )
+
+                    if job_ready:
+                        logger.info("Job %s is successfully complete", job_name)
+                        logger.info("Successfully created and verified job %d", job_index)
+                        successful_jobs += 1
+                    else:
+                        logger.error("Job %s failed to complete within timeout", job_name)
+                        continue
+
+                except Exception as e:
+                    logger.error("Failed to create job %d: %s", job_index, e)
+                    # Continue with next job instead of failing completely
+                    continue
+
+            # Check if all jobs were successful
+            if successful_jobs == number_of_jobs:
+                logger.info("Successfully created all %d job(s)", number_of_jobs)
+                return True
+            if successful_jobs > 0:
+                logger.warning("Created %d/%d job(s)", successful_jobs, number_of_jobs)
+                return False
+            logger.error("Failed to create any jobs")
+            return False
+
+        except Exception as e:
+            logger.error("Failed to create jobs: %s", e)
+            return False
diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py
index 635d14462a..34e3055fee 100644
--- a/modules/python/crud/main.py
+++ b/modules/python/crud/main.py
@@ -167,6 +167,17 @@ def handle_workload_operations(node_pool_crud, args):
             }
 
             result = node_pool_crud.create_deployment(**deploy_kwargs)
+        elif command == "jobs":
+            # Prepare job arguments
+            job_kwargs = {
+                "node_pool_name": args.node_pool_name,
+                "completions": args.completions,
+                "manifest_dir": args.manifest_dir,
+                "number_of_jobs": args.number_of_jobs,
+                "label_selector": args.label_selector,
+            }
+
+            result = node_pool_crud.create_job(**job_kwargs)
         else:
             logger.error("Unknown workload command: '%s'", command)
             return 1
@@ -386,6 +397,36 @@ def main():
     )
     deployment_parser.set_defaults(func=handle_workload_operations)
 
+    # Jobs command - add after the "deployment" command parser
+    jobs_parser = subparsers.add_parser(
+        "jobs", parents=[common_parser], help="create jobs"
+    )
+    jobs_parser.add_argument("--node-pool-name", required=True, help="Node pool name")
+    jobs_parser.add_argument(
+        "--number-of-jobs",
+        type=int,
+        default=1,
+        help="Number of jobs"
+    )
+    jobs_parser.add_argument(
+        "--completions",
+        type=int,
+        default=1,
+        help="Number of job completions"
+    )
+    jobs_parser.add_argument(
+        "--manifest-dir",
+        required=True,
+        help="Directory containing Kubernetes manifest files for the job"
+    )
+    jobs_parser.add_argument(
+        "--label-selector",
+        default="app=nginx-container",
+        help="Label selector for created job pods (default: app=nginx-container)"
+    )
+
+    jobs_parser.set_defaults(func=handle_workload_operations)
+
     # Arguments provided, run node pool operations and collect benchmark results
     try:
         args = parser.parse_args()
diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml
new file mode 100644
index 0000000000..d9627dfcb5
--- /dev/null
+++ b/modules/python/crud/workload_templates/job.yml
@@ -0,0 +1,20 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: myapp-{{NODE_POOL_NAME}}-{{INDEX}}
+  labels:
+    app: {{LABEL_VALUE}}
+spec:
+  completions: {{JOB_COMPLETIONS}}
+  template:
+    metadata:
+      labels:
+        app: {{LABEL_VALUE}}
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: {{LABEL_VALUE}}
+          image: mcr.microsoft.com/oss/nginx/nginx:1.21.6
+          command: ["nginx", "-t"]
+          ports:
+            - containerPort: 80
\ No newline at end of file

From cf94b1bd3daf335cbbdf52c9c40be64f516b2b89 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com>
Date: Mon, 4 May 2026 17:01:29 -0400
Subject: [PATCH 02/14] feat: wire job workload through pipeline topology
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add job execution step to the k8s CRUD engine pipeline between
deployment and scale-down. Parameters (number_of_jobs, completions)
flow from pipeline matrix → topology → engine step → main.py.

- Add jobs script block to steps/engine/crud/k8s/execute.yml
- Pass number_of_jobs and completions through topology execute-crud.yml
- Jobs run after deployment, before scale-down + delete
---
 steps/engine/crud/k8s/execute.yml            | 30 ++++++++++++++++++++
 steps/topology/k8s-crud-gpu/execute-crud.yml |  2 ++
 2 files changed, 32 insertions(+)

diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml
index bd3fbef2e3..7adf75f88e 100644
--- a/steps/engine/crud/k8s/execute.yml
+++ b/steps/engine/crud/k8s/execute.yml
@@ -10,6 +10,8 @@ parameters:
   step_wait_time: 30
   gpu_node_pool: false
   count: 1
+  number_of_jobs: 1
+  completions: 1
   replicas: 10
   manifest_dir: ""
 
@@ -86,6 +88,34 @@ steps:
       REPLICAS: ${{ parameters.replicas }}
       MANIFEST_DIR: ${{ parameters.manifest_dir }}
 
+- script: |
+    set -eo pipefail
+
+    # Deploy Jobs
+    PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" jobs \
+      --cloud "$CLOUD" \
+      --run-id "$RUN_ID" \
+      --result-dir "$RESULT_DIR" \
+      --node-pool-name "$POOL_NAME" \
+      --number-of-jobs "$NUMBER_OF_JOBS" \
+      --completions "$COMPLETIONS" \
+      --manifest-dir "$MANIFEST_DIR" \
+      --step-timeout "$STEP_TIME_OUT" \
+      ${GPU_NODE_POOL:+--gpu-node-pool}
+  displayName: 'Execute K8s Job operations for ${{ parameters.cloud }}'
+  workingDirectory: modules/python
+  env:
+    PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py
+    POOL_NAME: ${{ parameters.pool_name }}
+    CLOUD: ${{ parameters.cloud }}
+    STEP_TIME_OUT: ${{ parameters.step_time_out }}
+    RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID)
+    GPU_NODE_POOL: ${{ parameters.gpu_node_pool }}
+    NUMBER_OF_JOBS: ${{ parameters.number_of_jobs }}
+    COMPLETIONS: ${{ parameters.completions }}
+    MANIFEST_DIR: ${{ parameters.manifest_dir }}
+
+
 - script: |
     set -eo pipefail
 
diff --git a/steps/topology/k8s-crud-gpu/execute-crud.yml b/steps/topology/k8s-crud-gpu/execute-crud.yml
index 1c2b02d9c6..dc6da2b464 100644
--- a/steps/topology/k8s-crud-gpu/execute-crud.yml
+++ b/steps/topology/k8s-crud-gpu/execute-crud.yml
@@ -23,5 +23,7 @@ steps:
     gpu_node_pool: $(GPU_NODE_POOL)
     step_wait_time: $(STEP_WAIT_TIME)
     count: $(COUNT)
+    number_of_jobs: $(NUMBER_OF_JOBS)
+    completions: $(COMPLETIONS)
     replicas: $(REPLICAS)
     manifest_dir: $(MANIFEST_DIR)

From f13d1f1920b5e0b0469525a478a65ecd08099fd5 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com>
Date: Mon, 4 May 2026 17:01:49 -0400
Subject: [PATCH 03/14] test: add unit tests for job CRUD operations

Add comprehensive test coverage for create_job and job wait_for_condition:

- test_create_job_success: single job completes successfully
- test_create_job_failure: job fails to complete
- test_create_job_partial_success: continues on individual failures
- test_job_wait_for_condition: validates _check_job_condition and
  _is_job_condition_met for 'complete' and 'failed' states
- test_wait_for_job_completed: timeout and polling behavior
- Tests cover Job-specific semantics (succeeded count, completion_time,
  failed + no active pods)
---
 .../tests/clients/test_kubernetes_client.py   | 220 ++++++++++++++++++
 .../tests/crud/test_azure_node_pool_crud.py   |  70 ++++++
 2 files changed, 290 insertions(+)

diff --git a/modules/python/tests/clients/test_kubernetes_client.py b/modules/python/tests/clients/test_kubernetes_client.py
index 8190789361..318ca2e13d 100644
--- a/modules/python/tests/clients/test_kubernetes_client.py
+++ b/modules/python/tests/clients/test_kubernetes_client.py
@@ -2586,6 +2586,36 @@ def test_apply_single_manifest_statefulset_no_namespace(self, mock_create_statef
         mock_create_statefulset.assert_called_once_with(
             namespace="default", body=manifest)
 
+    @patch('kubernetes.client.BatchV1Api.create_namespaced_job')
+    def test_apply_single_manifest_job(self, mock_create_job):
+        """Test _apply_single_manifest with Job resource."""
+        manifest = {
+            "apiVersion": "batch/v1",
+            "kind": "Job",
+            "metadata": {"name": "test-job", "namespace": "test-namespace"},
+            "spec": {}
+        }
+
+        # pylint: disable=protected-access
+        self.client._apply_single_manifest(manifest)
+        mock_create_job.assert_called_once_with(
+            namespace="test-namespace", body=manifest)
+
+    @patch('kubernetes.client.BatchV1Api.create_namespaced_job')
+    def test_apply_single_manifest_job_no_namespace(self, mock_create_job):
+        """Test _apply_single_manifest with Job missing namespace uses 'default'."""
+        manifest = {
+            "apiVersion": "batch/v1",
+            "kind": "Job",
+            "metadata": {"name": "test-job"},
+            "spec": {}
+        }
+
+        # pylint: disable=protected-access
+        self.client._apply_single_manifest(manifest)
+        mock_create_job.assert_called_once_with(
+            namespace="default", body=manifest)
+
     @patch('kubernetes.client.CoreV1Api.create_namespaced_service')
     def test_apply_single_manifest_service(self, mock_create_service):
         """Test _apply_single_manifest with Service resource."""
@@ -3407,6 +3437,160 @@ def test_wait_for_condition_case_insensitive(self):
                     )
                     self.assertTrue(result)
 
+    # Tests for wait_for_condition with job resource type
+
+    @patch('time.time')
+    def test_wait_for_condition_job_complete_success(self, mock_time):
+        """Test wait_for_condition for a completed job - success case"""
+        mock_time.side_effect = [0, 0, 1, 2, 2]
+
+        mock_job = MagicMock()
+        mock_job.status.completion_time = "2026-04-20T00:00:00Z"
+        mock_job.status.succeeded = 1
+        mock_job.status.failed = None
+        mock_job.status.active = None
+        mock_job.status.conditions = None
+
+        with patch.object(self.client, 'batch') as mock_batch, \
+             patch('time.sleep'):
+            mock_batch.read_namespaced_job.return_value = mock_job
+
+            result = self.client.wait_for_condition(
+                resource_type="job",
+                resource_name="test-job",
+                wait_condition_type="complete",
+                namespace="test-namespace",
+                timeout_seconds=5
+            )
+
+            self.assertTrue(result)
+            mock_batch.read_namespaced_job.assert_called_with(
+                name="test-job",
+                namespace="test-namespace"
+            )
+
+    @patch('time.time')
+    def test_wait_for_condition_job_complete_timeout(self, mock_time):
+        """Test wait_for_condition for a job - timeout when not yet complete"""
+        mock_time.side_effect = [0, 0, 2, 5, 6, 6]
+
+        mock_job = MagicMock()
+        mock_job.status.completion_time = None
+        mock_job.status.succeeded = 0
+        mock_job.status.failed = None
+        mock_job.status.active = 1
+        mock_job.status.conditions = None
+
+        with patch.object(self.client, 'batch') as mock_batch, \
+             patch('time.sleep'):
+            mock_batch.read_namespaced_job.return_value = mock_job
+
+            result = self.client.wait_for_condition(
+                resource_type="job",
+                resource_name="test-job",
+                wait_condition_type="complete",
+                namespace="test-namespace",
+                timeout_seconds=1
+            )
+
+            self.assertFalse(result)
+
+    @patch('time.time')
+    def test_wait_for_condition_job_failed_success(self, mock_time):
+        """Test wait_for_condition for a failed job - detects failure"""
+        mock_time.side_effect = [0, 0, 1, 2, 2]
+
+        mock_job = MagicMock()
+        mock_job.status.completion_time = None
+        mock_job.status.succeeded = 0
+        mock_job.status.failed = 3
+        mock_job.status.active = 0
+        mock_job.status.conditions = None
+
+        with patch.object(self.client, 'batch') as mock_batch, \
+             patch('time.sleep'):
+            mock_batch.read_namespaced_job.return_value = mock_job
+
+            result = self.client.wait_for_condition(
+                resource_type="job",
+                resource_name="test-job",
+                wait_condition_type="failed",
+                namespace="test-namespace",
+                timeout_seconds=5
+            )
+
+            self.assertTrue(result)
+
+    @patch('time.time')
+    def test_wait_for_condition_all_jobs_complete(self, mock_time):
+        """Test wait_for_condition for all jobs in a namespace - all complete"""
+        mock_time.side_effect = [0, 0, 1, 2, 2]
+
+        mock_job1 = MagicMock()
+        mock_job1.status.completion_time = "2026-04-20T00:00:00Z"
+        mock_job1.status.succeeded = 1
+        mock_job1.status.failed = None
+        mock_job1.status.active = None
+        mock_job1.status.conditions = None
+
+        mock_job2 = MagicMock()
+        mock_job2.status.completion_time = "2026-04-20T00:01:00Z"
+        mock_job2.status.succeeded = 2
+        mock_job2.status.failed = None
+        mock_job2.status.active = None
+        mock_job2.status.conditions = None
+
+        with patch.object(self.client, 'batch') as mock_batch, \
+             patch('time.sleep'):
+            mock_batch.list_namespaced_job.return_value.items = [mock_job1, mock_job2]
+
+            result = self.client.wait_for_condition(
+                resource_type="job",
+                resource_name=None,
+                wait_condition_type="complete",
+                namespace="test-namespace",
+                timeout_seconds=5,
+                wait_all=True
+            )
+
+            self.assertTrue(result)
+            mock_batch.list_namespaced_job.assert_called_with(namespace="test-namespace")
+
+    @patch('time.time')
+    def test_wait_for_condition_job_not_found(self, mock_time):
+        """Test wait_for_condition for a job that doesn't exist - times out"""
+        mock_time.side_effect = [0, 0, 2, 5, 6, 6]
+
+        api_exception = ApiException(status=404, reason="Not Found")
+
+        with patch.object(self.client, 'batch') as mock_batch, \
+             patch('time.sleep'):
+            mock_batch.read_namespaced_job.side_effect = api_exception
+
+            result = self.client.wait_for_condition(
+                resource_type="job",
+                resource_name="nonexistent-job",
+                wait_condition_type="complete",
+                namespace="test-namespace",
+                timeout_seconds=1
+            )
+
+            self.assertFalse(result)
+
+    def test_wait_for_condition_job_invalid_condition(self):
+        """Test wait_for_condition with invalid condition for job resource"""
+        with self.assertRaises(ValueError) as context:
+            self.client.wait_for_condition(
+                resource_type="job",
+                resource_name="test-job",
+                wait_condition_type="available",
+                namespace="test-namespace",
+                timeout_seconds=1
+            )
+
+        self.assertIn("Invalid condition 'available' for resource type 'job'", str(context.exception))
+        self.assertIn("Valid conditions: complete, failed", str(context.exception))
+
     # Tests for the enhanced apply_manifest_from_file method with folder support
     @patch('os.path.isdir')
     @patch('os.path.isfile')
@@ -4020,6 +4204,42 @@ def test_delete_single_manifest_statefulset_no_namespace(self):
 
         self.assertIn("StatefulSet requires a namespace", str(context.exception))
 
+    def test_delete_single_manifest_job(self):
+        """Test deleting a single Job manifest."""
+        manifest = {
+            "apiVersion": "batch/v1",
+            "kind": "Job",
+            "metadata": {"name": "test-job", "namespace": "test-namespace"},
+            "spec": {}
+        }
+
+        with patch.object(self.client, 'batch') as mock_batch:
+            mock_batch.delete_namespaced_job.return_value = None
+
+            # pylint: disable=protected-access
+            self.client._delete_single_manifest(manifest)
+
+            mock_batch.delete_namespaced_job.assert_called_once_with(
+                name="test-job",
+                namespace="test-namespace",
+                body=unittest.mock.ANY
+            )
+
+    def test_delete_single_manifest_job_no_namespace(self):
+        """Test _delete_single_manifest with Job missing namespace raises error."""
+        manifest = {
+            "apiVersion": "batch/v1",
+            "kind": "Job",
+            "metadata": {"name": "test-job"},
+            "spec": {}
+        }
+
+        with self.assertRaises(Exception) as context:
+            # pylint: disable=protected-access
+            self.client._delete_single_manifest(manifest)
+
+        self.assertIn("Job requires a namespace", str(context.exception))
+
     def test_delete_single_manifest_service(self):
         """Test deleting a single service manifest."""
         manifest = {
diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py
index 9064a5af4a..7baabe28df 100644
--- a/modules/python/tests/crud/test_azure_node_pool_crud.py
+++ b/modules/python/tests/crud/test_azure_node_pool_crud.py
@@ -393,5 +393,75 @@ def test_create_deployment_partial_success(self):
         # Verify create_template was called 3 times (attempted all deployments)
         self.assertEqual(mock_k8s_client.create_template.call_count, 3)
 
+    def test_create_job_success(self):
+        """Test successful job creation"""
+        # Setup
+        mock_k8s_client = mock.MagicMock()
+        self.mock_aks_client.k8s_client = mock_k8s_client
+        # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop
+        mock_k8s_client.create_template.return_value = "apiVersion: batch/v1\nkind: Job\n"
+        mock_k8s_client.wait_for_condition.return_value = True
+
+        # Execute
+        result = self.node_pool_crud.create_job(node_pool_name="test-pool")
+
+        # Verify
+        self.assertTrue(result)
+
+    def test_create_job_failure(self):
+        """Test job creation failure"""
+        # Setup
+        mock_k8s_client = mock.MagicMock()
+        self.mock_aks_client.k8s_client = mock_k8s_client
+        # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop
+        mock_k8s_client.create_template.return_value = "apiVersion: batch/v1\nkind: Job\n"
+        mock_k8s_client.wait_for_condition.return_value = False
+
+        # Execute
+        result = self.node_pool_crud.create_job(node_pool_name="test-pool")
+
+        # Verify
+        self.assertFalse(result)
+
+    def test_create_job_no_client(self):
+        """Test job creation with no Kubernetes client"""
+        # Setup
+        self.mock_aks_client.k8s_client = None
+
+        # Execute
+        result = self.node_pool_crud.create_job(node_pool_name="test-pool")
+
+        # Verify
+        self.assertFalse(result)
+
+    def test_create_job_partial_success(self):
+        """Test job creation when some jobs succeed and others fail"""
+        # Setup
+        mock_k8s_client = mock.MagicMock()
+        self.mock_aks_client.k8s_client = mock_k8s_client
+
+        # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop
+        mock_k8s_client.create_template.return_value = "apiVersion: batch/v1\nkind: Job\n"
+
+        # Simulate: job 1 succeeds, job 2 fails, job 3 succeeds
+        # wait_for_condition returns True/False for each job
+        mock_k8s_client.wait_for_condition.side_effect = [True, False, True]
+
+        # Execute - request 3 jobs
+        result = self.node_pool_crud.create_job(
+            node_pool_name="test-pool",
+            number_of_jobs=3,
+            completions=5
+        )
+
+        # Verify - should return False (not all jobs succeeded)
+        self.assertFalse(result)
+
+        # Verify wait_for_condition was called 3 times (once per job)
+        self.assertEqual(mock_k8s_client.wait_for_condition.call_count, 3)
+
+        # Verify create_template was called 3 times (attempted all jobs)
+        self.assertEqual(mock_k8s_client.create_template.call_count, 3)
+
 if __name__ == "__main__":
     unittest.main()

From c2cf983f67436e64beb6ec30a8511858799de6ff Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com>
Date: Tue, 5 May 2026 15:31:17 -0400
Subject: [PATCH 04/14] fix: resolve trailing whitespace and yamllint issues

---
 modules/python/clients/kubernetes_client.py    | 2 +-
 modules/python/crud/workload_templates/job.yml | 2 +-
 steps/engine/crud/k8s/execute.yml              | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/python/clients/kubernetes_client.py b/modules/python/clients/kubernetes_client.py
index 9251351f5e..d85202b50f 100644
--- a/modules/python/clients/kubernetes_client.py
+++ b/modules/python/clients/kubernetes_client.py
@@ -1050,7 +1050,7 @@ def _check_resource_condition(self, resource_type: str, resource_name: str, cond
 
             if resource_type_lower in ['deployment', 'deployments']:
                 return self._check_deployment_condition(resource_name, condition_type, namespace, wait_all)
-            
+
             if resource_type_lower in ['job', 'jobs']:
                 return self._check_job_condition(resource_name, condition_type, namespace, wait_all)
 
diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml
index d9627dfcb5..4fc6642bff 100644
--- a/modules/python/crud/workload_templates/job.yml
+++ b/modules/python/crud/workload_templates/job.yml
@@ -17,4 +17,4 @@ spec:
           image: mcr.microsoft.com/oss/nginx/nginx:1.21.6
           command: ["nginx", "-t"]
           ports:
-            - containerPort: 80
\ No newline at end of file
+            - containerPort: 80
diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml
index 7adf75f88e..f7927f063e 100644
--- a/steps/engine/crud/k8s/execute.yml
+++ b/steps/engine/crud/k8s/execute.yml
@@ -115,7 +115,6 @@ steps:
     COMPLETIONS: ${{ parameters.completions }}
     MANIFEST_DIR: ${{ parameters.manifest_dir }}
 
-
 - script: |
     set -eo pipefail
 

From 0ce8b35f0ea0e0175c80fd6c6eba9394b13c1d94 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Thu, 14 May 2026 12:24:22 -0400
Subject: [PATCH 05/14] refactor: align job with deployment patterns from
 merged PR

- Extract _apply_job helper (matches _apply_deployment pattern)
- Use os.path for default template path instead of hardcoded string
- Use per-job labels to avoid selector collision
- Remove redundant outer try/except
- Use workload_common_parser for shared args (--count, --manifest-dir, etc.)
- Add hasattr guard for cloud provider compatibility
- Use args.count instead of args.number_of_jobs
- Rename subcommand from 'jobs' to 'job' (matches K8s resource type)
- Update pipeline YAML to use count parameter
---
 modules/python/crud/azure/node_pool_crud.py  | 173 ++++++++++---------
 modules/python/crud/main.py                  |  38 ++--
 steps/engine/crud/k8s/execute.yml            |   7 +-
 steps/topology/k8s-crud-gpu/execute-crud.yml |   1 -
 4 files changed, 112 insertions(+), 107 deletions(-)

diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py
index ff3321a21b..501f1c3ad3 100644
--- a/modules/python/crud/azure/node_pool_crud.py
+++ b/modules/python/crud/azure/node_pool_crud.py
@@ -410,6 +410,69 @@ def create_deployment(
         logger.warning("Created %d/%d deployment(s)", successful_deployments, number_of_deployments)
         return False
 
+    def _apply_job(
+        self,
+        k8s_client,
+        node_pool_name,
+        job_index,
+        completions,
+        manifest_dir,
+        label_selector,
+        namespace
+    ):
+        """Helper for create_job — applies and verifies a single Job."""
+        if manifest_dir:
+            # Use the template path from manifest_dir
+            template_path = f"{manifest_dir}/job.yml"
+        else:
+            # Use default template path (relative to workingDirectory: modules/python)
+            template_path = os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                "..", "workload_templates", "job.yml"
+            )
+
+        # Generate job name
+        job_name = f"myapp-{node_pool_name}-{job_index}"
+
+        # Use per-job label to avoid selector collision
+        job_label = f"{label_selector.split('=', 1)[-1]}-{job_index}"
+
+        # Create job template using k8s_client.create_template
+        job_template = k8s_client.create_template(
+            template_path,
+            {
+                "JOB_COMPLETIONS": completions,
+                "NODE_POOL_NAME": node_pool_name,
+                "INDEX": job_index,
+                "LABEL_VALUE": job_label,
+            }
+        )
+
+        # Apply each document in the rendered multi-doc template
+        for doc in yaml.safe_load_all(job_template):
+            if doc:
+                k8s_client.apply_manifest_from_file(manifest_dict=doc, namespace=namespace)
+
+        logger.info("Applied manifest for job %s", job_name)
+
+        # Wait for job to complete (successful job verification)
+        logger.info("Waiting for job %s to complete...", job_name)
+        job_ready = k8s_client.wait_for_condition(
+            resource_type="job",
+            wait_condition_type="complete",
+            resource_name=job_name,
+            namespace=namespace,
+            timeout_seconds=self.step_timeout
+        )
+
+        if not job_ready:
+            raise TimeoutError(
+                f"Job {job_name} failed to complete within timeout"
+            )
+
+        logger.info("Job %s is successfully complete", job_name)
+        logger.info("Successfully created and verified job %d", job_index)
+
     def create_job(
         self,
         node_pool_name,
@@ -424,11 +487,11 @@ def create_job(
 
         Args:
             node_pool_name: Name of the node pool to target
-            job_name: Base name for the jobs
             namespace: Kubernetes namespace (default: "default")
             completions: Number of job completions (default: 1)
             manifest_dir: Directory containing Kubernetes manifest files
             number_of_jobs: Number of jobs to create (default: 1)
+            label_selector: Label selector for pods (default: "app=nginx-container")
 
         Returns:
             True if all job creations were successful, False otherwise
@@ -438,82 +501,38 @@ def create_job(
         logger.info("Job completions: %d", completions)
         logger.info("Using manifest directory: %s", manifest_dir)
 
-        try:
-            # Get Kubernetes client from AKS client
-            k8s_client = self.aks_client.k8s_client
-
-            if not k8s_client:
-                logger.error("Kubernetes client not available")
-                return False
+        # Get Kubernetes client from AKS client
+        k8s_client = self.aks_client.k8s_client
 
-            successful_jobs = 0
-
-            # Loop through number of jobs
-            for job_index in range(1, number_of_jobs + 1):
-                logger.info("Creating job %d/%d", job_index, number_of_jobs)
-
-                try:
-                    if manifest_dir:
-                        # Use the template path from manifest_dir
-                        template_path = f"{manifest_dir}/job.yml"
-                    else:
-                        # Use default template path
-                        template_path = "modules/python/crud/workload_templates/job.yml"
-
-                    # Generate job name
-                    job_name = f"myapp-{node_pool_name}-{job_index}"
-
-                    # Create job template using k8s_client.create_template
-                    job_template = k8s_client.create_template(
-                        template_path,
-                        {
-                            "JOB_COMPLETIONS": completions,
-                            "NODE_POOL_NAME": node_pool_name,
-                            "INDEX": job_index,
-                            "LABEL_VALUE": label_selector.split("=", 1)[-1],
-                        }
-                    )
-
-                    # Apply each document in the rendered multi-doc template
-                    for doc in yaml.safe_load_all(job_template):
-                        if doc:
-                            k8s_client.apply_manifest_from_file(manifest_dict=doc)
-
-                    logger.info("Applied manifest for job %s", job_name)
-
-                    # Wait for job to complete (successful job verification)
-                    logger.info("Waiting for job %s to complete...", job_name)
-                    job_ready = k8s_client.wait_for_condition(
-                        resource_type="job",
-                        wait_condition_type="complete",
-                        resource_name=job_name,
-                        namespace=namespace,
-                        timeout_seconds=self.step_timeout
-                    )
-
-                    if job_ready:
-                        logger.info("Job %s is successfully complete", job_name)
-                        logger.info("Successfully created and verified job %d", job_index)
-                        successful_jobs += 1
-                    else:
-                        logger.error("Job %s failed to complete within timeout", job_name)
-                        continue
-
-                except Exception as e:
-                    logger.error("Failed to create job %d: %s", job_index, e)
-                    # Continue with next job instead of failing completely
-                    continue
-
-            # Check if all jobs were successful
-            if successful_jobs == number_of_jobs:
-                logger.info("Successfully created all %d job(s)", number_of_jobs)
-                return True
-            if successful_jobs > 0:
-                logger.warning("Created %d/%d job(s)", successful_jobs, number_of_jobs)
-                return False
-            logger.error("Failed to create any jobs")
+        if not k8s_client:
+            logger.error("Kubernetes client not available")
             return False
 
-        except Exception as e:
-            logger.error("Failed to create jobs: %s", e)
-            return False
+        successful_jobs = 0
+
+        # Loop through number of jobs
+        for job_index in range(1, number_of_jobs + 1):
+            logger.info("Creating job %d/%d", job_index, number_of_jobs)
+
+            try:
+                self._apply_job(
+                    k8s_client=k8s_client,
+                    node_pool_name=node_pool_name,
+                    job_index=job_index,
+                    completions=completions,
+                    manifest_dir=manifest_dir,
+                    label_selector=label_selector,
+                    namespace=namespace
+                )
+                successful_jobs += 1
+            except Exception as e:
+                logger.error("Failed to create job %d: %s", job_index, e)
+                # Continue with next job instead of failing completely
+                continue
+
+        # Check if all jobs were successful
+        if successful_jobs == number_of_jobs:
+            logger.info("Successfully created all %d job(s)", number_of_jobs)
+            return True
+        logger.warning("Created %d/%d job(s)", successful_jobs, number_of_jobs)
+        return False
diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py
index 34e3055fee..7dbcb6a99a 100644
--- a/modules/python/crud/main.py
+++ b/modules/python/crud/main.py
@@ -167,13 +167,17 @@ def handle_workload_operations(node_pool_crud, args):
             }
 
             result = node_pool_crud.create_deployment(**deploy_kwargs)
-        elif command == "jobs":
+        elif command == "job":
+            if not hasattr(node_pool_crud, 'create_job'):
+                logger.error("Cloud provider does not support job workload operations")
+                return 1
+
             # Prepare job arguments
             job_kwargs = {
                 "node_pool_name": args.node_pool_name,
                 "completions": args.completions,
                 "manifest_dir": args.manifest_dir,
-                "number_of_jobs": args.number_of_jobs,
+                "number_of_jobs": args.count,
                 "label_selector": args.label_selector,
             }
 
@@ -397,35 +401,19 @@ def main():
     )
     deployment_parser.set_defaults(func=handle_workload_operations)
 
-    # Jobs command - add after the "deployment" command parser
-    jobs_parser = subparsers.add_parser(
-        "jobs", parents=[common_parser], help="create jobs"
-    )
-    jobs_parser.add_argument("--node-pool-name", required=True, help="Node pool name")
-    jobs_parser.add_argument(
-        "--number-of-jobs",
-        type=int,
-        default=1,
-        help="Number of jobs"
+    # Job command
+    job_parser = subparsers.add_parser(
+        "job",
+        parents=[common_parser, workload_common_parser],
+        help="create jobs"
     )
-    jobs_parser.add_argument(
+    job_parser.add_argument(
         "--completions",
         type=int,
         default=1,
         help="Number of job completions"
     )
-    jobs_parser.add_argument(
-        "--manifest-dir",
-        required=True,
-        help="Directory containing Kubernetes manifest files for the job"
-    )
-    jobs_parser.add_argument(
-        "--label-selector",
-        default="app=nginx-container",
-        help="Label selector for created job pods (default: app=nginx-container)"
-    )
-
-    jobs_parser.set_defaults(func=handle_workload_operations)
+    job_parser.set_defaults(func=handle_workload_operations)
 
     # Arguments provided, run node pool operations and collect benchmark results
     try:
diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml
index f7927f063e..559ae15c77 100644
--- a/steps/engine/crud/k8s/execute.yml
+++ b/steps/engine/crud/k8s/execute.yml
@@ -10,7 +10,6 @@ parameters:
   step_wait_time: 30
   gpu_node_pool: false
   count: 1
-  number_of_jobs: 1
   completions: 1
   replicas: 10
   manifest_dir: ""
@@ -92,12 +91,12 @@ steps:
     set -eo pipefail
 
     # Deploy Jobs
-    PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" jobs \
+    PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" job \
       --cloud "$CLOUD" \
       --run-id "$RUN_ID" \
       --result-dir "$RESULT_DIR" \
       --node-pool-name "$POOL_NAME" \
-      --number-of-jobs "$NUMBER_OF_JOBS" \
+      --count "$COUNT" \
       --completions "$COMPLETIONS" \
       --manifest-dir "$MANIFEST_DIR" \
       --step-timeout "$STEP_TIME_OUT" \
@@ -111,7 +110,7 @@ steps:
     STEP_TIME_OUT: ${{ parameters.step_time_out }}
     RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID)
     GPU_NODE_POOL: ${{ parameters.gpu_node_pool }}
-    NUMBER_OF_JOBS: ${{ parameters.number_of_jobs }}
+    COUNT: ${{ parameters.count }}
     COMPLETIONS: ${{ parameters.completions }}
     MANIFEST_DIR: ${{ parameters.manifest_dir }}
 
diff --git a/steps/topology/k8s-crud-gpu/execute-crud.yml b/steps/topology/k8s-crud-gpu/execute-crud.yml
index dc6da2b464..7301e9b23f 100644
--- a/steps/topology/k8s-crud-gpu/execute-crud.yml
+++ b/steps/topology/k8s-crud-gpu/execute-crud.yml
@@ -23,7 +23,6 @@ steps:
     gpu_node_pool: $(GPU_NODE_POOL)
     step_wait_time: $(STEP_WAIT_TIME)
     count: $(COUNT)
-    number_of_jobs: $(NUMBER_OF_JOBS)
     completions: $(COMPLETIONS)
     replicas: $(REPLICAS)
     manifest_dir: $(MANIFEST_DIR)

From 5e148291f7c39a00e2ce1813fc40b3669b6e7faa Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Thu, 14 May 2026 12:40:36 -0400
Subject: [PATCH 06/14] fix: gate job step to Azure-only and use conditional
 manifest-dir

- Wrap job pipeline step inside Azure cloud gate (matches deployment)
- Use ${MANIFEST_DIR:+--manifest-dir} conditional (matches deployment pattern)
---
 steps/engine/crud/k8s/execute.yml | 50 +++++++++++++++----------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml
index 559ae15c77..7686202438 100644
--- a/steps/engine/crud/k8s/execute.yml
+++ b/steps/engine/crud/k8s/execute.yml
@@ -87,32 +87,32 @@ steps:
       REPLICAS: ${{ parameters.replicas }}
       MANIFEST_DIR: ${{ parameters.manifest_dir }}
 
-- script: |
-    set -eo pipefail
+  - script: |
+      set -eo pipefail
 
-    # Deploy Jobs
-    PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" job \
-      --cloud "$CLOUD" \
-      --run-id "$RUN_ID" \
-      --result-dir "$RESULT_DIR" \
-      --node-pool-name "$POOL_NAME" \
-      --count "$COUNT" \
-      --completions "$COMPLETIONS" \
-      --manifest-dir "$MANIFEST_DIR" \
-      --step-timeout "$STEP_TIME_OUT" \
-      ${GPU_NODE_POOL:+--gpu-node-pool}
-  displayName: 'Execute K8s Job operations for ${{ parameters.cloud }}'
-  workingDirectory: modules/python
-  env:
-    PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py
-    POOL_NAME: ${{ parameters.pool_name }}
-    CLOUD: ${{ parameters.cloud }}
-    STEP_TIME_OUT: ${{ parameters.step_time_out }}
-    RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID)
-    GPU_NODE_POOL: ${{ parameters.gpu_node_pool }}
-    COUNT: ${{ parameters.count }}
-    COMPLETIONS: ${{ parameters.completions }}
-    MANIFEST_DIR: ${{ parameters.manifest_dir }}
+      # Deploy Jobs
+      PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" job \
+        --cloud "$CLOUD" \
+        --run-id "$RUN_ID" \
+        --result-dir "$RESULT_DIR" \
+        --node-pool-name "$POOL_NAME" \
+        --count "$COUNT" \
+        --completions "$COMPLETIONS" \
+        ${MANIFEST_DIR:+--manifest-dir "$MANIFEST_DIR"} \
+        --step-timeout "$STEP_TIME_OUT" \
+        ${GPU_NODE_POOL:+--gpu-node-pool}
+    displayName: 'Execute K8s Job operations for ${{ parameters.cloud }}'
+    workingDirectory: modules/python
+    env:
+      PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py
+      POOL_NAME: ${{ parameters.pool_name }}
+      CLOUD: ${{ parameters.cloud }}
+      STEP_TIME_OUT: ${{ parameters.step_time_out }}
+      RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID)
+      GPU_NODE_POOL: ${{ parameters.gpu_node_pool }}
+      COUNT: ${{ parameters.count }}
+      COMPLETIONS: ${{ parameters.completions }}
+      MANIFEST_DIR: ${{ parameters.manifest_dir }}
 
 - script: |
     set -eo pipefail

From 3e9ac9f1e7dc9552bf834e7ec15d8ddcb456f1af Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Thu, 14 May 2026 12:45:42 -0400
Subject: [PATCH 07/14] fix: correct docstring to list supported workload types

---
 modules/python/crud/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py
index 7dbcb6a99a..612bd6a53a 100644
--- a/modules/python/crud/main.py
+++ b/modules/python/crud/main.py
@@ -147,7 +147,7 @@ def handle_node_pool_operation(node_pool_crud, args):
         return 1
 
 def handle_workload_operations(node_pool_crud, args):
-    """Handle workload operations (deployment, statefulset, jobs) based on the command"""
+    """Handle workload operations (deployment, job) based on the command"""
     command = args.command
     result = None
 

From 697d6149e8651d315296fd1d91ccac6cadc5141d Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Thu, 14 May 2026 14:07:42 -0400
Subject: [PATCH 08/14] docs: clarify nginx -t command choice in job template

---
 modules/python/crud/workload_templates/job.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml
index 4fc6642bff..c3cc782e10 100644
--- a/modules/python/crud/workload_templates/job.yml
+++ b/modules/python/crud/workload_templates/job.yml
@@ -15,6 +15,8 @@ spec:
       containers:
         - name: {{LABEL_VALUE}}
           image: mcr.microsoft.com/oss/nginx/nginx:1.21.6
+          # nginx -t validates config and exits immediately (exit 0).
+          # Fast, deterministic exit so we measure CRUD latency, not workload runtime.
           command: ["nginx", "-t"]
           ports:
             - containerPort: 80

From a4fa1cd6fde5c895bee31d332063d06d3f128d3b Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Thu, 14 May 2026 14:54:40 -0400
Subject: [PATCH 09/14] test: populate pipeline test config for job validation

---
 pipelines/system/new-pipeline-test.yml | 40 +++++++++++++++-----------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 63d55f02d9..11d8eb7252 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -1,25 +1,33 @@
 trigger: none
 
 variables:
-  SCENARIO_TYPE: <scenario-type>
-  SCENARIO_NAME: <scenario-name>
+  SCENARIO_TYPE: system
+  SCENARIO_NAME: k8s-crud-job-test
 
 stages:
-  - stage: <stage-name> # format: <cloud>[_<region>]+ (e.g. azure_eastus2, aws_eastus_westus)
+  - stage: azure_australiaeast
     dependsOn: []
     jobs:
-      - template: /jobs/competitive-test.yml # must keep as is
+      - template: /jobs/competitive-test.yml
         parameters:
-          cloud: <cloud> # e.g. azure, aws
-          regions: # list of regions
-            - region1 # e.g. eastus2
-          topology: <topology> # e.g. cluster-autoscaler
-          engine: <engine> # e.g. clusterloader2
-          matrix: # list of test parameters to customize the provisioned resources
-            <case-name>:
-              <key1>: <value1>
-              <key2>: <value2>
-          max_parallel: <number of concurrent jobs> # required
-          credential_type: service_connection # required
+          cloud: azure
+          regions:
+            - australiaeast
+          topology: k8s-crud
+          engine: crud
+          matrix:
+            job_crud:
+              pool_name: testpool
+              vm_size: Standard_D4s_v3
+              create_node_count: 0
+              scale_node_count: 3
+              scale_step_size: 1
+              step_time_out: 600
+              step_wait_time: 30
+              count: 1
+              completions: 1
+              manifest_dir: ""
+          max_parallel: 1
+          credential_type: service_connection
           ssh_key_enabled: false
-          timeout_in_minutes: 60 # if not specified, default is 60
+          timeout_in_minutes: 120

From 72f2650e2be11ac3756341c6189310000acc89fa Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Thu, 14 May 2026 15:02:31 -0400
Subject: [PATCH 10/14] fix: use k8s-crud-gpu topology (k8s-crud doesn't exist)

---
 pipelines/system/new-pipeline-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 11d8eb7252..7416b25f72 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -13,7 +13,7 @@ stages:
           cloud: azure
           regions:
             - australiaeast
-          topology: k8s-crud
+          topology: k8s-crud-gpu
           engine: crud
           matrix:
             job_crud:

From 9fe29a68a85dd4f3cf675032d8366fda30f14045 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Fri, 15 May 2026 10:26:08 -0400
Subject: [PATCH 11/14] fix: use existing perf-eval scenario for pipeline test

The k8s-gpu-cluster-crud scenario already has terraform inputs.
Custom scenario dirs will be reverted before merge.
---
 pipelines/system/new-pipeline-test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 7416b25f72..9ba03e19c4 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -1,8 +1,8 @@
 trigger: none
 
 variables:
-  SCENARIO_TYPE: system
-  SCENARIO_NAME: k8s-crud-job-test
+  SCENARIO_TYPE: perf-eval
+  SCENARIO_NAME: k8s-gpu-cluster-crud
 
 stages:
   - stage: azure_australiaeast

From e8fa8f61490d6a7419cc5d18e7ada1abdf5e94d2 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Fri, 15 May 2026 12:03:50 -0400
Subject: [PATCH 12/14] fix: add gpu_node_pool and replicas to test config, fix
 workload labels

- Add gpu_node_pool: '' to prevent GPU driver install on Standard_D4s_v3
- Add replicas: 10 for deployment step (topology passes it to engine)
- Include workload type in pod labels to prevent future collision:
  - deployment: nginx-container-deployment-{index}
  - job: nginx-container-job-{index}
---
 modules/python/crud/azure/node_pool_crud.py | 8 ++++----
 pipelines/system/new-pipeline-test.yml      | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py
index 501f1c3ad3..006de983f6 100644
--- a/modules/python/crud/azure/node_pool_crud.py
+++ b/modules/python/crud/azure/node_pool_crud.py
@@ -297,8 +297,8 @@ def _apply_deployment(
         # Generate deployment name
         deployment_name = f"myapp-{node_pool_name}-{deployment_index}"
 
-        # Use per-deployment label to avoid selector collision
-        deployment_label = f"{label_selector.split('=', 1)[-1]}-{deployment_index}"
+        # Use per-deployment label to avoid selector collision across workload types
+        deployment_label = f"{label_selector.split('=', 1)[-1]}-deployment-{deployment_index}"
 
         # Create deployment template using k8s_client.create_template
         deployment_template = k8s_client.create_template(
@@ -434,8 +434,8 @@ def _apply_job(
         # Generate job name
         job_name = f"myapp-{node_pool_name}-{job_index}"
 
-        # Use per-job label to avoid selector collision
-        job_label = f"{label_selector.split('=', 1)[-1]}-{job_index}"
+        # Use per-job label to avoid selector collision across workload types
+        job_label = f"{label_selector.split('=', 1)[-1]}-job-{job_index}"
 
         # Create job template using k8s_client.create_template
         job_template = k8s_client.create_template(
diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 9ba03e19c4..e2176ef295 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -17,6 +17,7 @@ stages:
           engine: crud
           matrix:
             job_crud:
+              gpu_node_pool: ""
               pool_name: testpool
               vm_size: Standard_D4s_v3
               create_node_count: 0
@@ -25,6 +26,7 @@ stages:
               step_time_out: 600
               step_wait_time: 30
               count: 1
+              replicas: 10
               completions: 1
               manifest_dir: ""
           max_parallel: 1

From 7d9c686ab6cb6fef659c1b7e9f487ca7e0c8fea0 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Fri, 15 May 2026 12:37:54 -0400
Subject: [PATCH 13/14] revert: restore pipeline test placeholder before merge

---
 pipelines/system/new-pipeline-test.yml | 42 ++++++++++----------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index e2176ef295..63d55f02d9 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -1,35 +1,25 @@
 trigger: none
 
 variables:
-  SCENARIO_TYPE: perf-eval
-  SCENARIO_NAME: k8s-gpu-cluster-crud
+  SCENARIO_TYPE: <scenario-type>
+  SCENARIO_NAME: <scenario-name>
 
 stages:
-  - stage: azure_australiaeast
+  - stage: <stage-name> # format: <cloud>[_<region>]+ (e.g. azure_eastus2, aws_eastus_westus)
     dependsOn: []
     jobs:
-      - template: /jobs/competitive-test.yml
+      - template: /jobs/competitive-test.yml # must keep as is
         parameters:
-          cloud: azure
-          regions:
-            - australiaeast
-          topology: k8s-crud-gpu
-          engine: crud
-          matrix:
-            job_crud:
-              gpu_node_pool: ""
-              pool_name: testpool
-              vm_size: Standard_D4s_v3
-              create_node_count: 0
-              scale_node_count: 3
-              scale_step_size: 1
-              step_time_out: 600
-              step_wait_time: 30
-              count: 1
-              replicas: 10
-              completions: 1
-              manifest_dir: ""
-          max_parallel: 1
-          credential_type: service_connection
+          cloud: <cloud> # e.g. azure, aws
+          regions: # list of regions
+            - region1 # e.g. eastus2
+          topology: <topology> # e.g. cluster-autoscaler
+          engine: <engine> # e.g. clusterloader2
+          matrix: # list of test parameters to customize the provisioned resources
+            <case-name>:
+              <key1>: <value1>
+              <key2>: <value2>
+          max_parallel: <number of concurrent jobs> # required
+          credential_type: service_connection # required
           ssh_key_enabled: false
-          timeout_in_minutes: 120
+          timeout_in_minutes: 60 # if not specified, default is 60

From c2c59b6b9349cfbc3192ece33e9cb7c401ff1e15 Mon Sep 17 00:00:00 2001
From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com>
Date: Fri, 15 May 2026 12:47:57 -0400
Subject: [PATCH 14/14] chore: remove inline comment from job template

---
 modules/python/crud/workload_templates/job.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml
index c3cc782e10..4fc6642bff 100644
--- a/modules/python/crud/workload_templates/job.yml
+++ b/modules/python/crud/workload_templates/job.yml
@@ -15,8 +15,6 @@ spec:
       containers:
         - name: {{LABEL_VALUE}}
           image: mcr.microsoft.com/oss/nginx/nginx:1.21.6
-          # nginx -t validates config and exits immediately (exit 0).
-          # Fast, deterministic exit so we measure CRUD latency, not workload runtime.
           command: ["nginx", "-t"]
           ports:
             - containerPort: 80