From 8b9f912f00cf7688bdfb6e1aee2c6b766670e18a Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 4 May 2026 17:01:20 -0400 Subject: [PATCH 01/14] feat: add job workload to CRUD module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add create_job() to NodePoolCRUD that deploys K8s Jobs onto node pools. Unlike deployments/statefulsets which run indefinitely, Jobs are run-to-completion workloads — success means the pod terminated cleanly (succeeded > 0), failure raises immediately (no self-healing). - Add 'jobs' subcommand to handle_workload_operations() in main.py with --number-of-jobs and --completions args - Add job.yml workload template with configurable completions and node affinity via label_selector - Add _check_job_condition and _is_job_condition_met to kubernetes_client.py — checks completion_time + succeeded count - Add wait_for_job_completed with 5-min timeout and 30s polling - Job kind support in apply/update/delete manifest methods --- modules/python/clients/kubernetes_client.py | 58 ++++++++++ modules/python/crud/azure/node_pool_crud.py | 108 ++++++++++++++++++ modules/python/crud/main.py | 41 +++++++ .../python/crud/workload_templates/job.yml | 20 ++++ 4 files changed, 227 insertions(+) create mode 100644 modules/python/crud/workload_templates/job.yml diff --git a/modules/python/clients/kubernetes_client.py b/modules/python/clients/kubernetes_client.py index 60838d2ce4..9251351f5e 100644 --- a/modules/python/clients/kubernetes_client.py +++ b/modules/python/clients/kubernetes_client.py @@ -975,6 +975,8 @@ def wait_for_condition(self, resource_type: str, wait_condition_type: str, names valid_conditions = { 'deployment': ['available', 'progressing', 'replicafailure', 'ready'], 'deployments': ['available', 'progressing', 'replicafailure', 'ready'], + 'job': ['complete', 'failed'], + 'jobs': ['complete', 'failed'], # Add more resource types as needed } @@ -1048,6 +1050,9 @@ def _check_resource_condition(self, resource_type: str, resource_name: str, cond if resource_type_lower in ['deployment', 'deployments']: return self._check_deployment_condition(resource_name, condition_type, namespace, wait_all) + + if resource_type_lower in ['job', 'jobs']: + return self._check_job_condition(resource_name, condition_type, namespace, wait_all) logger.warning(f"Unsupported resource type for condition checking: {resource_type}") return False @@ -1097,6 +1102,47 @@ def _is_deployment_condition_met(self, deployment, condition_type: str) -> bool: return False + def _check_job_condition(self, resource_name: str, condition_type: str, namespace: str, wait_all: bool) -> bool: + """Check job condition ('complete' or 'failed').""" + try: + if wait_all or not resource_name: + jobs = self.batch.list_namespaced_job(namespace=namespace).items + else: + job = self.batch.read_namespaced_job(name=resource_name, namespace=namespace) + jobs = [job] + + for job in jobs: + if not self._is_job_condition_met(job, condition_type): + return False + + return True + + except client.rest.ApiException as e: + if e.status == 404: + logger.debug("Job not found, waiting...") + return False + raise e + + def _is_job_condition_met(self, job, condition_type: str) -> bool: + """Check if a job meets the specified condition.""" + if not job.status: + return False + + condition_type_lower = condition_type.lower() + + if condition_type_lower == 'complete': + return bool(job.status.completion_time and job.status.succeeded and job.status.succeeded > 0) + + if condition_type_lower == 'failed': + return bool(job.status.failed and job.status.failed > 0 and not job.status.active) + + if job.status.conditions: + for condition in job.status.conditions: + if condition.type.lower() == condition_type_lower and condition.status == "True": + return True + + return False + def _expand_and_validate_manifests(self, manifests): """ Validate and expand manifests, handling List kind and non-dict manifests. @@ -1159,6 +1205,8 @@ def _apply_single_manifest(self, manifest, namespace=None): self.app.create_namespaced_daemon_set(namespace=namespace, body=manifest) elif kind == "StatefulSet": self.app.create_namespaced_stateful_set(namespace=namespace, body=manifest) + elif kind == "Job": + self.batch.create_namespaced_job(namespace=namespace, body=manifest) elif kind == "Service": self.api.create_namespaced_service(namespace=namespace, body=manifest) elif kind == "ConfigMap": @@ -1313,6 +1361,11 @@ def _update_single_manifest(self, manifest, namespace=None): self.app.patch_namespaced_stateful_set(name=name, namespace=namespace, body=manifest) else: raise ValueError("StatefulSet requires a namespace") + elif kind == "Job": + if namespace: + self.batch.patch_namespaced_job(name=name, namespace=namespace, body=manifest) + else: + raise ValueError("Job requires a namespace") elif kind == "Service": if namespace: self.api.patch_namespaced_service(name=name, namespace=namespace, body=manifest) @@ -1494,6 +1547,11 @@ def _delete_single_manifest(self, manifest, ignore_not_found: bool = True, names self.app.delete_namespaced_stateful_set(name=resource_name, namespace=namespace, body=delete_options) else: raise ValueError("StatefulSet requires a namespace") + elif kind == "Job": + if namespace: + self.batch.delete_namespaced_job(name=resource_name, namespace=namespace, body=delete_options) + else: + raise ValueError("Job requires a namespace") elif kind == "Service": if namespace: self.api.delete_namespaced_service(name=resource_name, namespace=namespace, body=delete_options) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 208589d3d7..ff3321a21b 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -409,3 +409,111 @@ def create_deployment( return True logger.warning("Created %d/%d deployment(s)", successful_deployments, number_of_deployments) return False + + def create_job( + self, + node_pool_name, + completions=1, + manifest_dir=None, + number_of_jobs=1, + label_selector="app=nginx-container", + namespace="default" + ): + """ + Create Kubernetes jobs after node pool operations. + + Args: + node_pool_name: Name of the node pool to target + job_name: Base name for the jobs + namespace: Kubernetes namespace (default: "default") + completions: Number of job completions (default: 1) + manifest_dir: Directory containing Kubernetes manifest files + number_of_jobs: Number of jobs to create (default: 1) + + Returns: + True if all job creations were successful, False otherwise + """ + logger.info("Creating %d job(s)", number_of_jobs) + logger.info("Target node pool: %s", node_pool_name) + logger.info("Job completions: %d", completions) + logger.info("Using manifest directory: %s", manifest_dir) + + try: + # Get Kubernetes client from AKS client + k8s_client = self.aks_client.k8s_client + + if not k8s_client: + logger.error("Kubernetes client not available") + return False + + successful_jobs = 0 + + # Loop through number of jobs + for job_index in range(1, number_of_jobs + 1): + logger.info("Creating job %d/%d", job_index, number_of_jobs) + + try: + if manifest_dir: + # Use the template path from manifest_dir + template_path = f"{manifest_dir}/job.yml" + else: + # Use default template path + template_path = "modules/python/crud/workload_templates/job.yml" + + # Generate job name + job_name = f"myapp-{node_pool_name}-{job_index}" + + # Create job template using k8s_client.create_template + job_template = k8s_client.create_template( + template_path, + { + "JOB_COMPLETIONS": completions, + "NODE_POOL_NAME": node_pool_name, + "INDEX": job_index, + "LABEL_VALUE": label_selector.split("=", 1)[-1], + } + ) + + # Apply each document in the rendered multi-doc template + for doc in yaml.safe_load_all(job_template): + if doc: + k8s_client.apply_manifest_from_file(manifest_dict=doc) + + logger.info("Applied manifest for job %s", job_name) + + # Wait for job to complete (successful job verification) + logger.info("Waiting for job %s to complete...", job_name) + job_ready = k8s_client.wait_for_condition( + resource_type="job", + wait_condition_type="complete", + resource_name=job_name, + namespace=namespace, + timeout_seconds=self.step_timeout + ) + + if job_ready: + logger.info("Job %s is successfully complete", job_name) + logger.info("Successfully created and verified job %d", job_index) + successful_jobs += 1 + else: + logger.error("Job %s failed to complete within timeout", job_name) + continue + + except Exception as e: + logger.error("Failed to create job %d: %s", job_index, e) + # Continue with next job instead of failing completely + continue + + # Check if all jobs were successful + if successful_jobs == number_of_jobs: + logger.info("Successfully created all %d job(s)", number_of_jobs) + return True + if successful_jobs > 0: + logger.warning("Created %d/%d job(s)", successful_jobs, number_of_jobs) + return False + logger.error("Failed to create any jobs") + return False + + except Exception as e: + logger.error("Failed to create jobs: %s", e) + return False diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 635d14462a..34e3055fee 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -167,6 +167,17 @@ def handle_workload_operations(node_pool_crud, args): } result = node_pool_crud.create_deployment(**deploy_kwargs) + elif command == "jobs": + # Prepare job arguments + job_kwargs = { + "node_pool_name": args.node_pool_name, + "completions": args.completions, + "manifest_dir": args.manifest_dir, + "number_of_jobs": args.number_of_jobs, + "label_selector": args.label_selector, + } + + result = node_pool_crud.create_job(**job_kwargs) else: logger.error("Unknown workload command: '%s'", command) return 1 @@ -386,6 +397,36 @@ def main(): ) deployment_parser.set_defaults(func=handle_workload_operations) + # Jobs command - add after the "deployment" command parser + jobs_parser = subparsers.add_parser( + "jobs", parents=[common_parser], help="create jobs" + ) + jobs_parser.add_argument("--node-pool-name", required=True, help="Node pool name") + jobs_parser.add_argument( + "--number-of-jobs", + type=int, + default=1, + help="Number of jobs" + ) + jobs_parser.add_argument( + "--completions", + type=int, + default=1, + help="Number of job completions" + ) + jobs_parser.add_argument( + "--manifest-dir", + required=True, + help="Directory containing Kubernetes manifest files for the job" + ) + jobs_parser.add_argument( + "--label-selector", + default="app=nginx-container", + help="Label selector for created job pods (default: app=nginx-container)" + ) + + jobs_parser.set_defaults(func=handle_workload_operations) + # Arguments provided, run node pool operations and collect benchmark results try: args = parser.parse_args() diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml new file mode 100644 index 0000000000..d9627dfcb5 --- /dev/null +++ b/modules/python/crud/workload_templates/job.yml @@ -0,0 +1,20 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} + labels: + app: {{LABEL_VALUE}} +spec: + completions: {{JOB_COMPLETIONS}} + template: + metadata: + labels: + app: {{LABEL_VALUE}} + spec: + restartPolicy: Never + containers: + - name: {{LABEL_VALUE}} + image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 + command: ["nginx", "-t"] + ports: + - containerPort: 80 \ No newline at end of file From cf94b1bd3daf335cbbdf52c9c40be64f516b2b89 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 4 May 2026 17:01:29 -0400 Subject: [PATCH 02/14] feat: wire job workload through pipeline topology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add job execution step to the k8s CRUD engine pipeline between deployment and scale-down. Parameters (number_of_jobs, completions) flow from pipeline matrix → topology → engine step → main.py. - Add jobs script block to steps/engine/crud/k8s/execute.yml - Pass number_of_jobs and completions through topology execute-crud.yml - Jobs run after deployment, before scale-down + delete --- steps/engine/crud/k8s/execute.yml | 30 ++++++++++++++++++++ steps/topology/k8s-crud-gpu/execute-crud.yml | 2 ++ 2 files changed, 32 insertions(+) diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index bd3fbef2e3..7adf75f88e 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -10,6 +10,8 @@ parameters: step_wait_time: 30 gpu_node_pool: false count: 1 + number_of_jobs: 1 + completions: 1 replicas: 10 manifest_dir: "" @@ -86,6 +88,34 @@ steps: REPLICAS: ${{ parameters.replicas }} MANIFEST_DIR: ${{ parameters.manifest_dir }} +- script: | + set -eo pipefail + + # Deploy Jobs + PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" jobs \ + --cloud "$CLOUD" \ + --run-id "$RUN_ID" \ + --result-dir "$RESULT_DIR" \ + --node-pool-name "$POOL_NAME" \ + --number-of-jobs "$NUMBER_OF_JOBS" \ + --completions "$COMPLETIONS" \ + --manifest-dir "$MANIFEST_DIR" \ + --step-timeout "$STEP_TIME_OUT" \ + ${GPU_NODE_POOL:+--gpu-node-pool} + displayName: 'Execute K8s Job operations for ${{ parameters.cloud }}' + workingDirectory: modules/python + env: + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py + POOL_NAME: ${{ parameters.pool_name }} + CLOUD: ${{ parameters.cloud }} + STEP_TIME_OUT: ${{ parameters.step_time_out }} + RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) + GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} + NUMBER_OF_JOBS: ${{ parameters.number_of_jobs }} + COMPLETIONS: ${{ parameters.completions }} + MANIFEST_DIR: ${{ parameters.manifest_dir }} + + - script: | set -eo pipefail diff --git a/steps/topology/k8s-crud-gpu/execute-crud.yml b/steps/topology/k8s-crud-gpu/execute-crud.yml index 1c2b02d9c6..dc6da2b464 100644 --- a/steps/topology/k8s-crud-gpu/execute-crud.yml +++ b/steps/topology/k8s-crud-gpu/execute-crud.yml @@ -23,5 +23,7 @@ steps: gpu_node_pool: $(GPU_NODE_POOL) step_wait_time: $(STEP_WAIT_TIME) count: $(COUNT) + number_of_jobs: $(NUMBER_OF_JOBS) + completions: $(COMPLETIONS) replicas: $(REPLICAS) manifest_dir: $(MANIFEST_DIR) From f13d1f1920b5e0b0469525a478a65ecd08099fd5 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 4 May 2026 17:01:49 -0400 Subject: [PATCH 03/14] test: add unit tests for job CRUD operations Add comprehensive test coverage for create_job and job wait_for_condition: - test_create_job_success: single job completes successfully - test_create_job_failure: job fails to complete - test_create_job_partial_success: continues on individual failures - test_job_wait_for_condition: validates _check_job_condition and _is_job_condition_met for 'complete' and 'failed' states - test_wait_for_job_completed: timeout and polling behavior - Tests cover Job-specific semantics (succeeded count, completion_time, failed + no active pods) --- .../tests/clients/test_kubernetes_client.py | 220 ++++++++++++++++++ .../tests/crud/test_azure_node_pool_crud.py | 70 ++++++ 2 files changed, 290 insertions(+) diff --git a/modules/python/tests/clients/test_kubernetes_client.py b/modules/python/tests/clients/test_kubernetes_client.py index 8190789361..318ca2e13d 100644 --- a/modules/python/tests/clients/test_kubernetes_client.py +++ b/modules/python/tests/clients/test_kubernetes_client.py @@ -2586,6 +2586,36 @@ def test_apply_single_manifest_statefulset_no_namespace(self, mock_create_statef mock_create_statefulset.assert_called_once_with( namespace="default", body=manifest) + @patch('kubernetes.client.BatchV1Api.create_namespaced_job') + def test_apply_single_manifest_job(self, mock_create_job): + """Test _apply_single_manifest with Job resource.""" + manifest = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": {"name": "test-job", "namespace": "test-namespace"}, + "spec": {} + } + + # pylint: disable=protected-access + self.client._apply_single_manifest(manifest) + mock_create_job.assert_called_once_with( + namespace="test-namespace", body=manifest) + + @patch('kubernetes.client.BatchV1Api.create_namespaced_job') + def test_apply_single_manifest_job_no_namespace(self, mock_create_job): + """Test _apply_single_manifest with Job missing namespace uses 'default'.""" + manifest = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": {"name": "test-job"}, + "spec": {} + } + + # pylint: disable=protected-access + self.client._apply_single_manifest(manifest) + mock_create_job.assert_called_once_with( + namespace="default", body=manifest) + @patch('kubernetes.client.CoreV1Api.create_namespaced_service') def test_apply_single_manifest_service(self, mock_create_service): """Test _apply_single_manifest with Service resource.""" @@ -3407,6 +3437,160 @@ def test_wait_for_condition_case_insensitive(self): ) self.assertTrue(result) + # Tests for wait_for_condition with job resource type + + @patch('time.time') + def test_wait_for_condition_job_complete_success(self, mock_time): + """Test wait_for_condition for a completed job - success case""" + mock_time.side_effect = [0, 0, 1, 2, 2] + + mock_job = MagicMock() + mock_job.status.completion_time = "2026-04-20T00:00:00Z" + mock_job.status.succeeded = 1 + mock_job.status.failed = None + mock_job.status.active = None + mock_job.status.conditions = None + + with patch.object(self.client, 'batch') as mock_batch, \ + patch('time.sleep'): + mock_batch.read_namespaced_job.return_value = mock_job + + result = self.client.wait_for_condition( + resource_type="job", + resource_name="test-job", + wait_condition_type="complete", + namespace="test-namespace", + timeout_seconds=5 + ) + + self.assertTrue(result) + mock_batch.read_namespaced_job.assert_called_with( + name="test-job", + namespace="test-namespace" + ) + + @patch('time.time') + def test_wait_for_condition_job_complete_timeout(self, mock_time): + """Test wait_for_condition for a job - timeout when not yet complete""" + mock_time.side_effect = [0, 0, 2, 5, 6, 6] + + mock_job = MagicMock() + mock_job.status.completion_time = None + mock_job.status.succeeded = 0 + mock_job.status.failed = None + mock_job.status.active = 1 + mock_job.status.conditions = None + + with patch.object(self.client, 'batch') as mock_batch, \ + patch('time.sleep'): + mock_batch.read_namespaced_job.return_value = mock_job + + result = self.client.wait_for_condition( + resource_type="job", + resource_name="test-job", + wait_condition_type="complete", + namespace="test-namespace", + timeout_seconds=1 + ) + + self.assertFalse(result) + + @patch('time.time') + def test_wait_for_condition_job_failed_success(self, mock_time): + """Test wait_for_condition for a failed job - detects failure""" + mock_time.side_effect = [0, 0, 1, 2, 2] + + mock_job = MagicMock() + mock_job.status.completion_time = None + mock_job.status.succeeded = 0 + mock_job.status.failed = 3 + mock_job.status.active = 0 + mock_job.status.conditions = None + + with patch.object(self.client, 'batch') as mock_batch, \ + patch('time.sleep'): + mock_batch.read_namespaced_job.return_value = mock_job + + result = self.client.wait_for_condition( + resource_type="job", + resource_name="test-job", + wait_condition_type="failed", + namespace="test-namespace", + timeout_seconds=5 + ) + + self.assertTrue(result) + + @patch('time.time') + def test_wait_for_condition_all_jobs_complete(self, mock_time): + """Test wait_for_condition for all jobs in a namespace - all complete""" + mock_time.side_effect = [0, 0, 1, 2, 2] + + mock_job1 = MagicMock() + mock_job1.status.completion_time = "2026-04-20T00:00:00Z" + mock_job1.status.succeeded = 1 + mock_job1.status.failed = None + mock_job1.status.active = None + mock_job1.status.conditions = None + + mock_job2 = MagicMock() + mock_job2.status.completion_time = "2026-04-20T00:01:00Z" + mock_job2.status.succeeded = 2 + mock_job2.status.failed = None + mock_job2.status.active = None + mock_job2.status.conditions = None + + with patch.object(self.client, 'batch') as mock_batch, \ + patch('time.sleep'): + mock_batch.list_namespaced_job.return_value.items = [mock_job1, mock_job2] + + result = self.client.wait_for_condition( + resource_type="job", + resource_name=None, + wait_condition_type="complete", + namespace="test-namespace", + timeout_seconds=5, + wait_all=True + ) + + self.assertTrue(result) + mock_batch.list_namespaced_job.assert_called_with(namespace="test-namespace") + + @patch('time.time') + def test_wait_for_condition_job_not_found(self, mock_time): + """Test wait_for_condition for a job that doesn't exist - times out""" + mock_time.side_effect = [0, 0, 2, 5, 6, 6] + + api_exception = ApiException(status=404, reason="Not Found") + + with patch.object(self.client, 'batch') as mock_batch, \ + patch('time.sleep'): + mock_batch.read_namespaced_job.side_effect = api_exception + + result = self.client.wait_for_condition( + resource_type="job", + resource_name="nonexistent-job", + wait_condition_type="complete", + namespace="test-namespace", + timeout_seconds=1 + ) + + self.assertFalse(result) + + def test_wait_for_condition_job_invalid_condition(self): + """Test wait_for_condition with invalid condition for job resource""" + with self.assertRaises(ValueError) as context: + self.client.wait_for_condition( + resource_type="job", + resource_name="test-job", + wait_condition_type="available", + namespace="test-namespace", + timeout_seconds=1 + ) + + self.assertIn("Invalid condition 'available' for resource type 'job'", str(context.exception)) + self.assertIn("Valid conditions: complete, failed", str(context.exception)) + # Tests for the enhanced apply_manifest_from_file method with folder support @patch('os.path.isdir') @patch('os.path.isfile') @@ -4020,6 +4204,42 @@ def test_delete_single_manifest_statefulset_no_namespace(self): self.assertIn("StatefulSet requires a namespace", str(context.exception)) + def test_delete_single_manifest_job(self): + """Test deleting a single Job manifest.""" + manifest = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": {"name": "test-job", "namespace": "test-namespace"}, + "spec": {} + } + + with patch.object(self.client, 'batch') as mock_batch: + mock_batch.delete_namespaced_job.return_value = None + + # pylint: disable=protected-access + self.client._delete_single_manifest(manifest) + + mock_batch.delete_namespaced_job.assert_called_once_with( + name="test-job", + namespace="test-namespace", + body=unittest.mock.ANY + ) + + def test_delete_single_manifest_job_no_namespace(self): + """Test _delete_single_manifest with Job missing namespace raises error.""" + manifest = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": {"name": "test-job"}, + "spec": {} + } + + with self.assertRaises(Exception) as context: + # pylint: disable=protected-access + self.client._delete_single_manifest(manifest) + + self.assertIn("Job requires a namespace", str(context.exception)) + def test_delete_single_manifest_service(self): """Test deleting a single service manifest.""" manifest = { diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 9064a5af4a..7baabe28df 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -393,5 +393,75 @@ def test_create_deployment_partial_success(self): # Verify create_template was called 3 times (attempted all deployments) self.assertEqual(mock_k8s_client.create_template.call_count, 3) + def test_create_job_success(self): + """Test successful job creation""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: batch/v1\nkind: Job\n" + mock_k8s_client.wait_for_condition.return_value = True + + # Execute + result = self.node_pool_crud.create_job(node_pool_name="test-pool") + + # Verify + self.assertTrue(result) + + def test_create_job_failure(self): + """Test job creation failure""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: batch/v1\nkind: Job\n" + mock_k8s_client.wait_for_condition.return_value = False + + # Execute + result = self.node_pool_crud.create_job(node_pool_name="test-pool") + + # Verify + self.assertFalse(result) + + def test_create_job_no_client(self): + """Test job creation with no Kubernetes client""" + # Setup + self.mock_aks_client.k8s_client = None + + # Execute + result = self.node_pool_crud.create_job(node_pool_name="test-pool") + + # Verify + self.assertFalse(result) + + def test_create_job_partial_success(self): + """Test job creation when some jobs succeed and others fail""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: batch/v1\nkind: Job\n" + + # Simulate: job 1 succeeds, job 2 fails, job 3 succeeds + # wait_for_condition returns True/False for each job + mock_k8s_client.wait_for_condition.side_effect = [True, False, True] + + # Execute - request 3 jobs + result = self.node_pool_crud.create_job( + node_pool_name="test-pool", + number_of_jobs=3, + completions=5 + ) + + # Verify - should return False (not all jobs succeeded) + self.assertFalse(result) + + # Verify wait_for_condition was called 3 times (once per job) + self.assertEqual(mock_k8s_client.wait_for_condition.call_count, 3) + + # Verify create_template was called 3 times (attempted all jobs) + self.assertEqual(mock_k8s_client.create_template.call_count, 3) + if __name__ == "__main__": unittest.main() From c2cf983f67436e64beb6ec30a8511858799de6ff Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 5 May 2026 15:31:17 -0400 Subject: [PATCH 04/14] fix: resolve trailing whitespace and yamllint issues --- modules/python/clients/kubernetes_client.py | 2 +- modules/python/crud/workload_templates/job.yml | 2 +- steps/engine/crud/k8s/execute.yml | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/python/clients/kubernetes_client.py b/modules/python/clients/kubernetes_client.py index 9251351f5e..d85202b50f 100644 --- a/modules/python/clients/kubernetes_client.py +++ b/modules/python/clients/kubernetes_client.py @@ -1050,7 +1050,7 @@ def _check_resource_condition(self, resource_type: str, resource_name: str, cond if resource_type_lower in ['deployment', 'deployments']: return self._check_deployment_condition(resource_name, condition_type, namespace, wait_all) - + if resource_type_lower in ['job', 'jobs']: return self._check_job_condition(resource_name, condition_type, namespace, wait_all) diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml index d9627dfcb5..4fc6642bff 100644 --- a/modules/python/crud/workload_templates/job.yml +++ b/modules/python/crud/workload_templates/job.yml @@ -17,4 +17,4 @@ spec: image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 command: ["nginx", "-t"] ports: - - containerPort: 80 \ No newline at end of file + - containerPort: 80 diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index 7adf75f88e..f7927f063e 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -115,7 +115,6 @@ steps: COMPLETIONS: ${{ parameters.completions }} MANIFEST_DIR: ${{ parameters.manifest_dir }} - - script: | set -eo pipefail From 0ce8b35f0ea0e0175c80fd6c6eba9394b13c1d94 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Thu, 14 May 2026 12:24:22 -0400 Subject: [PATCH 05/14] refactor: align job with deployment patterns from merged PR - Extract _apply_job helper (matches _apply_deployment pattern) - Use os.path for default template path instead of hardcoded string - Use per-job labels to avoid selector collision - Remove redundant outer try/except - Use workload_common_parser for shared args (--count, --manifest-dir, etc.) - Add hasattr guard for cloud provider compatibility - Use args.count instead of args.number_of_jobs - Rename subcommand from 'jobs' to 'job' (matches K8s resource type) - Update pipeline YAML to use count parameter --- modules/python/crud/azure/node_pool_crud.py | 173 ++++++++++--------- modules/python/crud/main.py | 38 ++-- steps/engine/crud/k8s/execute.yml | 7 +- steps/topology/k8s-crud-gpu/execute-crud.yml | 1 - 4 files changed, 112 insertions(+), 107 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index ff3321a21b..501f1c3ad3 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -410,6 +410,69 @@ def create_deployment( logger.warning("Created %d/%d deployment(s)", successful_deployments, number_of_deployments) return False + def _apply_job( + self, + k8s_client, + node_pool_name, + job_index, + completions, + manifest_dir, + label_selector, + namespace + ): + """Helper for create_job — applies and verifies a single Job.""" + if manifest_dir: + # Use the template path from manifest_dir + template_path = f"{manifest_dir}/job.yml" + else: + # Use default template path (relative to workingDirectory: modules/python) + template_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "..", "workload_templates", "job.yml" + ) + + # Generate job name + job_name = f"myapp-{node_pool_name}-{job_index}" + + # Use per-job label to avoid selector collision + job_label = f"{label_selector.split('=', 1)[-1]}-{job_index}" + + # Create job template using k8s_client.create_template + job_template = k8s_client.create_template( + template_path, + { + "JOB_COMPLETIONS": completions, + "NODE_POOL_NAME": node_pool_name, + "INDEX": job_index, + "LABEL_VALUE": job_label, + } + ) + + # Apply each document in the rendered multi-doc template + for doc in yaml.safe_load_all(job_template): + if doc: + k8s_client.apply_manifest_from_file(manifest_dict=doc, namespace=namespace) + + logger.info("Applied manifest for job %s", job_name) + + # Wait for job to complete (successful job verification) + logger.info("Waiting for job %s to complete...", job_name) + job_ready = k8s_client.wait_for_condition( + resource_type="job", + wait_condition_type="complete", + resource_name=job_name, + namespace=namespace, + timeout_seconds=self.step_timeout + ) + + if not job_ready: + raise TimeoutError( + f"Job {job_name} failed to complete within timeout" + ) + + logger.info("Job %s is successfully complete", job_name) + logger.info("Successfully created and verified job %d", job_index) + def create_job( self, node_pool_name, @@ -424,11 +487,11 @@ def create_job( Args: node_pool_name: Name of the node pool to target - job_name: Base name for the jobs namespace: Kubernetes namespace (default: "default") completions: Number of job completions (default: 1) manifest_dir: Directory containing Kubernetes manifest files number_of_jobs: Number of jobs to create (default: 1) + label_selector: Label selector for pods (default: "app=nginx-container") Returns: True if all job creations were successful, False otherwise @@ -438,82 +501,38 @@ def create_job( logger.info("Job completions: %d", completions) logger.info("Using manifest directory: %s", manifest_dir) - try: - # Get Kubernetes client from AKS client - k8s_client = self.aks_client.k8s_client - - if not k8s_client: - logger.error("Kubernetes client not available") - return False + # Get Kubernetes client from AKS client + k8s_client = self.aks_client.k8s_client - successful_jobs = 0 - - # Loop through number of jobs - for job_index in range(1, number_of_jobs + 1): - logger.info("Creating job %d/%d", job_index, number_of_jobs) - - try: - if manifest_dir: - # Use the template path from manifest_dir - template_path = f"{manifest_dir}/job.yml" - else: - # Use default template path - template_path = "modules/python/crud/workload_templates/job.yml" - - # Generate job name - job_name = f"myapp-{node_pool_name}-{job_index}" - - # Create job template using k8s_client.create_template - job_template = k8s_client.create_template( - template_path, - { - "JOB_COMPLETIONS": completions, - "NODE_POOL_NAME": node_pool_name, - "INDEX": job_index, - "LABEL_VALUE": label_selector.split("=", 1)[-1], - } - ) - - # Apply each document in the rendered multi-doc template - for doc in yaml.safe_load_all(job_template): - if doc: - k8s_client.apply_manifest_from_file(manifest_dict=doc) - - logger.info("Applied manifest for job %s", job_name) - - # Wait for job to complete (successful job verification) - logger.info("Waiting for job %s to complete...", job_name) - job_ready = k8s_client.wait_for_condition( - resource_type="job", - wait_condition_type="complete", - resource_name=job_name, - namespace=namespace, - timeout_seconds=self.step_timeout - ) - - if job_ready: - logger.info("Job %s is successfully complete", job_name) - logger.info("Successfully created and verified job %d", job_index) - successful_jobs += 1 - else: - logger.error("Job %s failed to complete within timeout", job_name) - continue - - except Exception as e: - logger.error("Failed to create job %d: %s", job_index, e) - # Continue with next job instead of failing completely - continue - - # Check if all jobs were successful - if successful_jobs == number_of_jobs: - logger.info("Successfully created all %d job(s)", number_of_jobs) - return True - if successful_jobs > 0: - logger.warning("Created %d/%d job(s)", successful_jobs, number_of_jobs) - return False - logger.error("Failed to create any jobs") + if not k8s_client: + logger.error("Kubernetes client not available") return False - except Exception as e: - logger.error("Failed to create jobs: %s", e) - return False + successful_jobs = 0 + + # Loop through number of jobs + for job_index in range(1, number_of_jobs + 1): + logger.info("Creating job %d/%d", job_index, number_of_jobs) + + try: + self._apply_job( + k8s_client=k8s_client, + node_pool_name=node_pool_name, + job_index=job_index, + completions=completions, + manifest_dir=manifest_dir, + label_selector=label_selector, + namespace=namespace + ) + successful_jobs += 1 + except Exception as e: + logger.error("Failed to create job %d: %s", job_index, e) + # Continue with next job instead of failing completely + continue + + # Check if all jobs were successful + if successful_jobs == number_of_jobs: + logger.info("Successfully created all %d job(s)", number_of_jobs) + return True + logger.warning("Created %d/%d job(s)", successful_jobs, number_of_jobs) + return False diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 34e3055fee..7dbcb6a99a 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -167,13 +167,17 @@ def handle_workload_operations(node_pool_crud, args): } result = node_pool_crud.create_deployment(**deploy_kwargs) - elif command == "jobs": + elif command == "job": + if not hasattr(node_pool_crud, 'create_job'): + logger.error("Cloud provider does not support job workload operations") + return 1 + # Prepare job arguments job_kwargs = { "node_pool_name": args.node_pool_name, "completions": args.completions, "manifest_dir": args.manifest_dir, - "number_of_jobs": args.number_of_jobs, + "number_of_jobs": args.count, "label_selector": args.label_selector, } @@ -397,35 +401,19 @@ def main(): ) deployment_parser.set_defaults(func=handle_workload_operations) - # Jobs command - add after the "deployment" command parser - jobs_parser = subparsers.add_parser( - "jobs", parents=[common_parser], help="create jobs" - ) - jobs_parser.add_argument("--node-pool-name", required=True, help="Node pool name") - jobs_parser.add_argument( - "--number-of-jobs", - type=int, - default=1, - help="Number of jobs" + # Job command + job_parser = subparsers.add_parser( + "job", + parents=[common_parser, workload_common_parser], + help="create jobs" ) - jobs_parser.add_argument( + job_parser.add_argument( "--completions", type=int, default=1, help="Number of job completions" ) - jobs_parser.add_argument( - "--manifest-dir", - required=True, - help="Directory containing Kubernetes manifest files for the job" - ) - jobs_parser.add_argument( - "--label-selector", - default="app=nginx-container", - help="Label selector for created job pods (default: app=nginx-container)" - ) - - jobs_parser.set_defaults(func=handle_workload_operations) + job_parser.set_defaults(func=handle_workload_operations) # Arguments provided, run node pool operations and collect benchmark results try: diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index f7927f063e..559ae15c77 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -10,7 +10,6 @@ parameters: step_wait_time: 30 gpu_node_pool: false count: 1 - number_of_jobs: 1 completions: 1 replicas: 10 manifest_dir: "" @@ -92,12 +91,12 @@ steps: set -eo pipefail # Deploy Jobs - PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" jobs \ + PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" job \ --cloud "$CLOUD" \ --run-id "$RUN_ID" \ --result-dir "$RESULT_DIR" \ --node-pool-name "$POOL_NAME" \ - --number-of-jobs "$NUMBER_OF_JOBS" \ + --count "$COUNT" \ --completions "$COMPLETIONS" \ --manifest-dir "$MANIFEST_DIR" \ --step-timeout "$STEP_TIME_OUT" \ @@ -111,7 +110,7 @@ steps: STEP_TIME_OUT: ${{ parameters.step_time_out }} RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} - NUMBER_OF_JOBS: ${{ parameters.number_of_jobs }} + COUNT: ${{ parameters.count }} COMPLETIONS: ${{ parameters.completions }} MANIFEST_DIR: ${{ parameters.manifest_dir }} diff --git a/steps/topology/k8s-crud-gpu/execute-crud.yml b/steps/topology/k8s-crud-gpu/execute-crud.yml index dc6da2b464..7301e9b23f 100644 --- a/steps/topology/k8s-crud-gpu/execute-crud.yml +++ b/steps/topology/k8s-crud-gpu/execute-crud.yml @@ -23,7 +23,6 @@ steps: gpu_node_pool: $(GPU_NODE_POOL) step_wait_time: $(STEP_WAIT_TIME) count: $(COUNT) - number_of_jobs: $(NUMBER_OF_JOBS) completions: $(COMPLETIONS) replicas: $(REPLICAS) manifest_dir: $(MANIFEST_DIR) From 5e148291f7c39a00e2ce1813fc40b3669b6e7faa Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Thu, 14 May 2026 12:40:36 -0400 Subject: [PATCH 06/14] fix: gate job step to Azure-only and use conditional manifest-dir - Wrap job pipeline step inside Azure cloud gate (matches deployment) - Use ${MANIFEST_DIR:+--manifest-dir} conditional (matches deployment pattern) --- steps/engine/crud/k8s/execute.yml | 50 +++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index 559ae15c77..7686202438 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -87,32 +87,32 @@ steps: REPLICAS: ${{ parameters.replicas }} MANIFEST_DIR: ${{ parameters.manifest_dir }} -- script: | - set -eo pipefail + - script: | + set -eo pipefail - # Deploy Jobs - PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" job \ - --cloud "$CLOUD" \ - --run-id "$RUN_ID" \ - --result-dir "$RESULT_DIR" \ - --node-pool-name "$POOL_NAME" \ - --count "$COUNT" \ - --completions "$COMPLETIONS" \ - --manifest-dir "$MANIFEST_DIR" \ - --step-timeout "$STEP_TIME_OUT" \ - ${GPU_NODE_POOL:+--gpu-node-pool} - displayName: 'Execute K8s Job operations for ${{ parameters.cloud }}' - workingDirectory: modules/python - env: - PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py - POOL_NAME: ${{ parameters.pool_name }} - CLOUD: ${{ parameters.cloud }} - STEP_TIME_OUT: ${{ parameters.step_time_out }} - RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) - GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} - COUNT: ${{ parameters.count }} - COMPLETIONS: ${{ parameters.completions }} - MANIFEST_DIR: ${{ parameters.manifest_dir }} + # Deploy Jobs + PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" job \ + --cloud "$CLOUD" \ + --run-id "$RUN_ID" \ + --result-dir "$RESULT_DIR" \ + --node-pool-name "$POOL_NAME" \ + --count "$COUNT" \ + --completions "$COMPLETIONS" \ + ${MANIFEST_DIR:+--manifest-dir "$MANIFEST_DIR"} \ + --step-timeout "$STEP_TIME_OUT" \ + ${GPU_NODE_POOL:+--gpu-node-pool} + displayName: 'Execute K8s Job operations for ${{ parameters.cloud }}' + workingDirectory: modules/python + env: + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py + POOL_NAME: ${{ parameters.pool_name }} + CLOUD: ${{ parameters.cloud }} + STEP_TIME_OUT: ${{ parameters.step_time_out }} + RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) + GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} + COUNT: ${{ parameters.count }} + COMPLETIONS: ${{ parameters.completions }} + MANIFEST_DIR: ${{ parameters.manifest_dir }} - script: | set -eo pipefail From 3e9ac9f1e7dc9552bf834e7ec15d8ddcb456f1af Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Thu, 14 May 2026 12:45:42 -0400 Subject: [PATCH 07/14] fix: correct docstring to list supported workload types --- modules/python/crud/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 7dbcb6a99a..612bd6a53a 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -147,7 +147,7 @@ def handle_node_pool_operation(node_pool_crud, args): return 1 def handle_workload_operations(node_pool_crud, args): - """Handle workload operations (deployment, statefulset, jobs) based on the command""" + """Handle workload operations (deployment, job) based on the command""" command = args.command result = None From 697d6149e8651d315296fd1d91ccac6cadc5141d Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Thu, 14 May 2026 14:07:42 -0400 Subject: [PATCH 08/14] docs: clarify nginx -t command choice in job template --- modules/python/crud/workload_templates/job.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml index 4fc6642bff..c3cc782e10 100644 --- a/modules/python/crud/workload_templates/job.yml +++ b/modules/python/crud/workload_templates/job.yml @@ -15,6 +15,8 @@ spec: containers: - name: {{LABEL_VALUE}} image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 + # nginx -t validates config and exits immediately (exit 0). + # Fast, deterministic exit so we measure CRUD latency, not workload runtime. command: ["nginx", "-t"] ports: - containerPort: 80 From a4fa1cd6fde5c895bee31d332063d06d3f128d3b Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Thu, 14 May 2026 14:54:40 -0400 Subject: [PATCH 09/14] test: populate pipeline test config for job validation --- pipelines/system/new-pipeline-test.yml | 40 +++++++++++++++----------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..11d8eb7252 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,33 @@ trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: system + SCENARIO_NAME: k8s-crud-job-test stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_australiaeast dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - australiaeast + topology: k8s-crud + engine: crud + matrix: + job_crud: + pool_name: testpool + vm_size: Standard_D4s_v3 + create_node_count: 0 + scale_node_count: 3 + scale_step_size: 1 + step_time_out: 600 + step_wait_time: 30 + count: 1 + completions: 1 + manifest_dir: "" + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 120 From 72f2650e2be11ac3756341c6189310000acc89fa Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Thu, 14 May 2026 15:02:31 -0400 Subject: [PATCH 10/14] fix: use k8s-crud-gpu topology (k8s-crud doesn't exist) --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 11d8eb7252..7416b25f72 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -13,7 +13,7 @@ stages: cloud: azure regions: - australiaeast - topology: k8s-crud + topology: k8s-crud-gpu engine: crud matrix: job_crud: From 9fe29a68a85dd4f3cf675032d8366fda30f14045 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Fri, 15 May 2026 10:26:08 -0400 Subject: [PATCH 11/14] fix: use existing perf-eval scenario for pipeline test The k8s-gpu-cluster-crud scenario already has terraform inputs. Custom scenario dirs will be reverted before merge. --- pipelines/system/new-pipeline-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 7416b25f72..9ba03e19c4 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,8 +1,8 @@ trigger: none variables: - SCENARIO_TYPE: system - SCENARIO_NAME: k8s-crud-job-test + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: k8s-gpu-cluster-crud stages: - stage: azure_australiaeast From e8fa8f61490d6a7419cc5d18e7ada1abdf5e94d2 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Fri, 15 May 2026 12:03:50 -0400 Subject: [PATCH 12/14] fix: add gpu_node_pool and replicas to test config, fix workload labels - Add gpu_node_pool: '' to prevent GPU driver install on Standard_D4s_v3 - Add replicas: 10 for deployment step (topology passes it to engine) - Include workload type in pod labels to prevent future collision: - deployment: nginx-container-deployment-{index} - job: nginx-container-job-{index} --- modules/python/crud/azure/node_pool_crud.py | 8 ++++---- pipelines/system/new-pipeline-test.yml | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 501f1c3ad3..006de983f6 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -297,8 +297,8 @@ def _apply_deployment( # Generate deployment name deployment_name = f"myapp-{node_pool_name}-{deployment_index}" - # Use per-deployment label to avoid selector collision - deployment_label = f"{label_selector.split('=', 1)[-1]}-{deployment_index}" + # Use per-deployment label to avoid selector collision across workload types + deployment_label = f"{label_selector.split('=', 1)[-1]}-deployment-{deployment_index}" # Create deployment template using k8s_client.create_template deployment_template = k8s_client.create_template( @@ -434,8 +434,8 @@ def _apply_job( # Generate job name job_name = f"myapp-{node_pool_name}-{job_index}" - # Use per-job label to avoid selector collision - job_label = f"{label_selector.split('=', 1)[-1]}-{job_index}" + # Use per-job label to avoid selector collision across workload types + job_label = f"{label_selector.split('=', 1)[-1]}-job-{job_index}" # Create job template using k8s_client.create_template job_template = k8s_client.create_template( diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 9ba03e19c4..e2176ef295 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -17,6 +17,7 @@ stages: engine: crud matrix: job_crud: + gpu_node_pool: "" pool_name: testpool vm_size: Standard_D4s_v3 create_node_count: 0 @@ -25,6 +26,7 @@ stages: step_time_out: 600 step_wait_time: 30 count: 1 + replicas: 10 completions: 1 manifest_dir: "" max_parallel: 1 From 7d9c686ab6cb6fef659c1b7e9f487ca7e0c8fea0 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Fri, 15 May 2026 12:37:54 -0400 Subject: [PATCH 13/14] revert: restore pipeline test placeholder before merge --- pipelines/system/new-pipeline-test.yml | 42 ++++++++++---------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index e2176ef295..63d55f02d9 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,35 +1,25 @@ trigger: none variables: - SCENARIO_TYPE: perf-eval - SCENARIO_NAME: k8s-gpu-cluster-crud + SCENARIO_TYPE: + SCENARIO_NAME: stages: - - stage: azure_australiaeast + - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) dependsOn: [] jobs: - - template: /jobs/competitive-test.yml + - template: /jobs/competitive-test.yml # must keep as is parameters: - cloud: azure - regions: - - australiaeast - topology: k8s-crud-gpu - engine: crud - matrix: - job_crud: - gpu_node_pool: "" - pool_name: testpool - vm_size: Standard_D4s_v3 - create_node_count: 0 - scale_node_count: 3 - scale_step_size: 1 - step_time_out: 600 - step_wait_time: 30 - count: 1 - replicas: 10 - completions: 1 - manifest_dir: "" - max_parallel: 1 - credential_type: service_connection + cloud: # e.g. azure, aws + regions: # list of regions + - region1 # e.g. eastus2 + topology: # e.g. cluster-autoscaler + engine: # e.g. clusterloader2 + matrix: # list of test parameters to customize the provisioned resources + : + : + : + max_parallel: # required + credential_type: service_connection # required ssh_key_enabled: false - timeout_in_minutes: 120 + timeout_in_minutes: 60 # if not specified, default is 60 From c2c59b6b9349cfbc3192ece33e9cb7c401ff1e15 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+diamondpowell@users.noreply.github.com> Date: Fri, 15 May 2026 12:47:57 -0400 Subject: [PATCH 14/14] chore: remove inline comment from job template --- modules/python/crud/workload_templates/job.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/python/crud/workload_templates/job.yml b/modules/python/crud/workload_templates/job.yml index c3cc782e10..4fc6642bff 100644 --- a/modules/python/crud/workload_templates/job.yml +++ b/modules/python/crud/workload_templates/job.yml @@ -15,8 +15,6 @@ spec: containers: - name: {{LABEL_VALUE}} image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 - # nginx -t validates config and exits immediately (exit 0). - # Fast, deterministic exit so we measure CRUD latency, not workload runtime. command: ["nginx", "-t"] ports: - containerPort: 80