From 2cb191bc89c1a533499c2e5e6ffb0b160941d2aa Mon Sep 17 00:00:00 2001 From: Amandine Souilleux Date: Fri, 17 Apr 2026 16:46:28 +0200 Subject: [PATCH 1/6] [sc-301863] override the Automatic mode with System in all cases --- python-lib/dku_azure/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-lib/dku_azure/clusters.py b/python-lib/dku_azure/clusters.py index 6c79bf3..dc3957a 100644 --- a/python-lib/dku_azure/clusters.py +++ b/python-lib/dku_azure/clusters.py @@ -303,7 +303,7 @@ def with_availability_zones(self, use_availability_zones): def build(self): agent_pool_profile_params = {} - if self.mode == "Automatic" and self.idx == 0: + if self.mode == "Automatic": agent_pool_profile_params["mode"] = "System" else: agent_pool_profile_params["mode"] = self.mode From ea68aaee4135cb215052dc9149332a2b7dd1971e Mon Sep 17 00:00:00 2001 From: Amandine Souilleux Date: Wed, 29 Apr 2026 18:51:42 +0200 Subject: [PATCH 2/6] determine the node pool mode dynamically --- python-clusters/create-aks-cluster/cluster.py | 13 ++++++++++--- python-lib/dku_azure/clusters.py | 5 +---- python-lib/dku_azure/utils.py | 14 ++++++++++++++ python-runnables/add-node-pool/runnable.py | 4 ++-- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/python-clusters/create-aks-cluster/cluster.py b/python-clusters/create-aks-cluster/cluster.py index 61e81d4..4f84de1 100644 --- a/python-clusters/create-aks-cluster/cluster.py +++ b/python-clusters/create-aks-cluster/cluster.py @@ -14,7 +14,7 @@ from dku_kube.nvidia_utils import add_gpu_driver_if_needed from dku_azure.auth import get_credentials_from_connection_info, get_credentials_from_connection_infoV2 from dku_azure.clusters import ClusterBuilder -from dku_azure.utils import run_and_process_cloud_error, get_instance_metadata, get_subscription_id +from dku_azure.utils import run_and_process_cloud_error, get_instance_metadata, get_subscription_id, determine_node_pool_mode class MyCluster(Cluster): def __init__(self, cluster_id, cluster_name, config, plugin_config): @@ -306,6 +306,7 @@ def start(self): # Node pools install_gpu_driver = False gpu_node_pools_taints = set() + is_there_system_node_pool = False for idx, node_pool_conf in enumerate(self.config.get("nodePools", [])): node_pool_builder = cluster_builder.get_node_pool_builder() node_pool_builder.with_idx(idx) @@ -328,8 +329,14 @@ def start(self): min_num_nodes=node_pool_conf.get("minNumNodes", None), max_num_nodes=node_pool_conf.get("maxNumNodes", None)) - node_pool_builder.with_mode(mode=node_pool_conf.get("mode", "Automatic"), - system_pods_only=node_pool_conf.get("systemPodsOnly", True)) + input_node_pool_mode = node_pool_conf.get("mode", "Automatic") + applied_system_pods_only = False + applied_node_pool_mode = determine_node_pool_mode(node_pool_conf.get("mode", "Automatic"), is_there_system_node_pool) + if applied_node_pool_mode == "System": + is_there_system_node_pool = True + applied_system_pods_only = node_pool_conf.get("systemPodsOnly", True) + node_pool_builder.with_mode(mode=applied_node_pool_mode, + system_pods_only=applied_system_pods_only) node_pool_builder.with_disk_size_gb(disk_size_gb=node_pool_conf.get("osDiskSizeGb", 0)) node_pool_builder.with_node_labels(node_pool_conf.get("labels", None)) diff --git a/python-lib/dku_azure/clusters.py b/python-lib/dku_azure/clusters.py index dc3957a..ffdc511 100644 --- a/python-lib/dku_azure/clusters.py +++ b/python-lib/dku_azure/clusters.py @@ -303,10 +303,7 @@ def with_availability_zones(self, use_availability_zones): def build(self): agent_pool_profile_params = {} - if self.mode == "Automatic": - agent_pool_profile_params["mode"] = "System" - else: - agent_pool_profile_params["mode"] = self.mode + agent_pool_profile_params["mode"] = self.mode agent_pool_profile_params["name"] = "nodepool{}".format(self.idx) agent_pool_profile_params["type"] = self.agent_pool_type agent_pool_profile_params["vm_size"] = self.vm_size diff --git a/python-lib/dku_azure/utils.py b/python-lib/dku_azure/utils.py index 80c7977..482cd89 100644 --- a/python-lib/dku_azure/utils.py +++ b/python-lib/dku_azure/utils.py @@ -94,3 +94,17 @@ def get_host_network(credentials=None, resource_group=None, connection_info=None logging.info("VNET: {}".format(vnet)) logging.info("SUBNET ID: {}".format(subnet_id)) return vnet, subnet_id + +def is_existing_system_node_pool(existing_node_pool_modes): + is_existing_system_node_pool = False + for node_pool_mode in existing_node_pool_modes: + is_existing_system_node_pool = is_existing_system_node_pool or node_pool_mode == "System" + return is_existing_system_node_pool + +def determine_node_pool_mode(input_node_pool_mode, is_existing_system_node_pool): + if input_node_pool_mode != "Automatic": + return input_node_pool_mode + if is_existing_system_node_pool: + return "User" + else: + return "System" \ No newline at end of file diff --git a/python-runnables/add-node-pool/runnable.py b/python-runnables/add-node-pool/runnable.py index f7edfae..6c538d7 100644 --- a/python-runnables/add-node-pool/runnable.py +++ b/python-runnables/add-node-pool/runnable.py @@ -3,7 +3,7 @@ from dku_utils.cluster import get_cluster_from_dss_cluster from dku_utils.taints import Toleration from dku_azure.clusters import NodePoolBuilder -from dku_azure.utils import run_and_process_cloud_error, get_instance_metadata, get_subscription_id +from dku_azure.utils import run_and_process_cloud_error, get_instance_metadata, get_subscription_id, is_existing_system_node_pool, determine_node_pool_mode from dku_kube.nvidia_utils import add_gpu_driver_if_needed class MyRunnable(Runnable): @@ -97,7 +97,7 @@ def run(self, progress_callback): min_num_nodes=node_pool_config.get("minNumNodes", None), max_num_nodes=node_pool_config.get("maxNumNodes", None)) - node_pool_builder.with_mode(mode=node_pool_config.get("mode", "Automatic"), + node_pool_builder.with_mode(mode=determine_node_pool_mode(node_pool_config.get("mode", "Automatic"), is_existing_system_node_pool([node_pool.mode for node_pool in node_pools])), system_pods_only=node_pool_config.get("systemPodsOnly", True)) node_pool_builder.with_disk_size_gb(disk_size_gb=node_pool_config.get("osDiskSizeGb", 0)) From 79e96955c25690f71454c262d543a77da01ad660 Mon Sep 17 00:00:00 2001 From: Amandine Souilleux Date: Thu, 30 Apr 2026 14:20:21 +0200 Subject: [PATCH 3/6] Automatic -> User always in add node pool macro --- python-runnables/add-node-pool/runnable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python-runnables/add-node-pool/runnable.py b/python-runnables/add-node-pool/runnable.py index 6c538d7..32a2aff 100644 --- a/python-runnables/add-node-pool/runnable.py +++ b/python-runnables/add-node-pool/runnable.py @@ -97,7 +97,9 @@ def run(self, progress_callback): min_num_nodes=node_pool_config.get("minNumNodes", None), max_num_nodes=node_pool_config.get("maxNumNodes", None)) - node_pool_builder.with_mode(mode=determine_node_pool_mode(node_pool_config.get("mode", "Automatic"), is_existing_system_node_pool([node_pool.mode for node_pool in node_pools])), + input_node_pool_mode = node_pool_config.get("mode", "Automatic") + applied_node_pool_mode = "User" if input_node_pool_mode == "Automatic" else input_node_pool_mode + node_pool_builder.with_mode(mode=applied_node_pool_mode, system_pods_only=node_pool_config.get("systemPodsOnly", True)) node_pool_builder.with_disk_size_gb(disk_size_gb=node_pool_config.get("osDiskSizeGb", 0)) From 8c6f7630039f8d2096ca3daa26a72698fa71f57c Mon Sep 17 00:00:00 2001 From: Amandine Souilleux Date: Thu, 30 Apr 2026 14:22:53 +0200 Subject: [PATCH 4/6] add comment and cleanup --- python-runnables/add-node-pool/runnable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python-runnables/add-node-pool/runnable.py b/python-runnables/add-node-pool/runnable.py index 32a2aff..33b43de 100644 --- a/python-runnables/add-node-pool/runnable.py +++ b/python-runnables/add-node-pool/runnable.py @@ -3,7 +3,7 @@ from dku_utils.cluster import get_cluster_from_dss_cluster from dku_utils.taints import Toleration from dku_azure.clusters import NodePoolBuilder -from dku_azure.utils import run_and_process_cloud_error, get_instance_metadata, get_subscription_id, is_existing_system_node_pool, determine_node_pool_mode +from dku_azure.utils import run_and_process_cloud_error, get_instance_metadata, get_subscription_id from dku_kube.nvidia_utils import add_gpu_driver_if_needed class MyRunnable(Runnable): @@ -98,6 +98,9 @@ def run(self, progress_callback): max_num_nodes=node_pool_config.get("maxNumNodes", None)) input_node_pool_mode = node_pool_config.get("mode", "Automatic") + # The cluster cannot be created without a System node pool (error raised), + # deleting the last System node pool is not possible (error raised), + # so adding an Automatic node pool will always result in adding a User node pool applied_node_pool_mode = "User" if input_node_pool_mode == "Automatic" else input_node_pool_mode node_pool_builder.with_mode(mode=applied_node_pool_mode, system_pods_only=node_pool_config.get("systemPodsOnly", True)) From 332f79870b10b464c69089bb1c2a25c80ec7aa70 Mon Sep 17 00:00:00 2001 From: Amandine Souilleux Date: Thu, 30 Apr 2026 14:27:34 +0200 Subject: [PATCH 5/6] cleanup --- python-lib/dku_azure/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python-lib/dku_azure/utils.py b/python-lib/dku_azure/utils.py index 482cd89..03232ed 100644 --- a/python-lib/dku_azure/utils.py +++ b/python-lib/dku_azure/utils.py @@ -95,12 +95,6 @@ def get_host_network(credentials=None, resource_group=None, connection_info=None logging.info("SUBNET ID: {}".format(subnet_id)) return vnet, subnet_id -def is_existing_system_node_pool(existing_node_pool_modes): - is_existing_system_node_pool = False - for node_pool_mode in existing_node_pool_modes: - is_existing_system_node_pool = is_existing_system_node_pool or node_pool_mode == "System" - return is_existing_system_node_pool - def determine_node_pool_mode(input_node_pool_mode, is_existing_system_node_pool): if input_node_pool_mode != "Automatic": return input_node_pool_mode From 23ecebb23ea29b6601429c35a3a91047adab69ba Mon Sep 17 00:00:00 2001 From: Amandine Souilleux Date: Thu, 7 May 2026 17:05:20 +0200 Subject: [PATCH 6/6] fix setting the taint for system node pools --- python-clusters/create-aks-cluster/cluster.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python-clusters/create-aks-cluster/cluster.py b/python-clusters/create-aks-cluster/cluster.py index 4f84de1..34c3c4d 100644 --- a/python-clusters/create-aks-cluster/cluster.py +++ b/python-clusters/create-aks-cluster/cluster.py @@ -330,13 +330,11 @@ def start(self): max_num_nodes=node_pool_conf.get("maxNumNodes", None)) input_node_pool_mode = node_pool_conf.get("mode", "Automatic") - applied_system_pods_only = False applied_node_pool_mode = determine_node_pool_mode(node_pool_conf.get("mode", "Automatic"), is_there_system_node_pool) if applied_node_pool_mode == "System": is_there_system_node_pool = True - applied_system_pods_only = node_pool_conf.get("systemPodsOnly", True) node_pool_builder.with_mode(mode=applied_node_pool_mode, - system_pods_only=applied_system_pods_only) + system_pods_only=node_pool_conf.get("systemPodsOnly", True)) node_pool_builder.with_disk_size_gb(disk_size_gb=node_pool_conf.get("osDiskSizeGb", 0)) node_pool_builder.with_node_labels(node_pool_conf.get("labels", None))