-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy path048-base-managed_clusters-onboard.sh
More file actions
executable file
·404 lines (322 loc) · 12.5 KB
/
048-base-managed_clusters-onboard.sh
File metadata and controls
executable file
·404 lines (322 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
#!/bin/bash
# Onboards management clusters and then their workload clusters with the
# `tanzu tmc` CLI. Helper libraries are loaded from the utils/ directory that
# sits next to this script (resolved via BASH_SOURCE, so the caller's working
# directory does not matter for the `source` lines).
# log.sh presumably provides the `log <level> <message>` helper used below.
source "$(dirname "${BASH_SOURCE[0]}")"/utils/log.sh
# context.sh presumably provides `use_tmc_sm_context` (called from init).
source "$(dirname "${BASH_SOURCE[0]}")"/utils/context.sh
# check-cluster.sh presumably provides `wait_cluster_ready` (called from onboard_workload_cluster).
source "$(dirname "${BASH_SOURCE[0]}")"/utils/check-cluster.sh
# Directory with the cluster data files; the path is relative to the current working directory.
CLUSTER_DATA_DIR="data/clusters"
# YAML file whose .managementClusters[] list is iterated by onboard_mgmt_clusters.
MC_LIST_YAML_FILE=$CLUSTER_DATA_DIR/mc_list.yaml
# Index file with one "<management cluster name>: <kubeconfig path>" entry per line.
MC_KUBECONFIG_INDEX_FILE=$CLUSTER_DATA_DIR/mc-kubeconfig-index-file
# Names of the management clusters handled by the current run, one per line
# (truncated at the start of onboard_mgmt_clusters, read by onboard_workload_clusters).
REGISTERED_FILE=$CLUSTER_DATA_DIR/mc_registered.txt
# Template value in the index file; init aborts while it is still present.
PLACEHOLDER_TEXT="/path/to/the/real/mc_kubeconfig/file"
# Workload clusters whose sanitized version sorts below this one are skipped.
MIN_VERSION="1.28.0"
# Index of onboarded workload clusters, one "<mgmt>.<provisioner>.<name>" entry per line.
ONBOARDED_CLUSTER_INDEX_FILE="$CLUSTER_DATA_DIR/onboarded-clusters-name-index"
# Configuration for parallel processing: number of workload clusters onboarded
# concurrently per batch. Can be overridden from the environment; defaults to 1.
CLUSTERS_ONBOARD_BATCH_SIZE=${CLUSTERS_ONBOARD_BATCH_SIZE:-1}
function init () {
  # Pre-flight check: the kubeconfig index file must no longer contain the
  # template placeholder. When it is clean, hand over to use_tmc_sm_context
  # and return with its status.
  if ! grep -q "$PLACEHOLDER_TEXT" "$MC_KUBECONFIG_INDEX_FILE"; then
    use_tmc_sm_context
    return
  fi

  # Placeholder still present: the index file has not been fully filled in.
  log error "⚠️ Placeholder text '$PLACEHOLDER_TEXT' found in $MC_KUBECONFIG_INDEX_FILE. Please replace it."
  exit 1
}
# Clean up registration resources and recreate the agent config in every
# namespace whose name starts with "svc-tmc-".
# For each such namespace, in this order: delete existing AgentInstall objects,
# run the UNINSTALL operation for stale resources, then recreate the AgentConfig.
# Takes no arguments; returns the status of the last helper that ran.
function prepare_for_vks() {
  # Keep the working variables local so they do not leak into the caller.
  local namespaces ns
  namespaces=$(kubectl get ns --no-headers -o custom-columns=":metadata.name" | grep '^svc-tmc-')
  # Kubernetes namespace names cannot contain whitespace, so splitting the
  # list on whitespace is intentional here.
  for ns in $namespaces; do
    log info "Checking namespace: $ns"
    # Delete the agentinstall if it exists.
    delete_agent_installs "$ns"
    # Uninstall the pre-installation.
    uninstall_stale_res "$ns"
    # Prepare agent config
    prepare_agent_config "$ns"
  done
}
# Recreate the AgentConfig in a namespace.
# Arguments: $1 - target namespace (optional). When omitted, the first
#                 namespace whose name starts with "svc-tmc-" is used.
# Returns:   1 when no target namespace can be determined, otherwise the
#            status of create_agent_config.
function prepare_agent_config() {
  local ns="$1"
  if [[ -z "$ns" ]]; then
    ns=$(kubectl get ns -o custom-columns=":metadata.name" | grep '^svc-tmc-' | head -n 1)
  fi
  # Without a namespace the kubectl calls below would run with `-n ""` and
  # touch whatever namespace the current context points at, so stop here.
  if [[ -z "$ns" ]]; then
    log error "No svc-tmc-* namespace found, unable to prepare the AgentConfig"
    return 1
  fi
  # Delete the agent config if it exists.
  delete_agent_config "$ns"
  # Create agent config.
  create_agent_config "$ns"
}
# Remove every AgentConfig object from the given namespace.
# Arguments: $1 - namespace to clean.
function delete_agent_config() {
  local target_ns="$1"

  # A failing `kubectl get` is treated as "nothing to delete".
  if ! kubectl get agentconfig -n "$target_ns" &>/dev/null; then
    log info "No AgentConfig in $target_ns"
    return
  fi

  log info "Deleting AgentConfig(s) in $target_ns..."
  kubectl delete agentconfig --all -n "$target_ns"
}
# Remove every AgentInstall object from the given namespace.
# Arguments: $1 - namespace to clean.
function delete_agent_installs() {
  local namespace="$1"

  # Probe first; when the probe fails there is nothing this function deletes.
  if kubectl get agentinstall -n "$namespace" >/dev/null 2>&1; then
    log info "🧹 Deleting AgentInstall(s) in $namespace..."
    kubectl delete agentinstall --all -n "$namespace"
    return
  fi

  log info "No AgentInstall in $namespace"
}
# Create (or update) the AgentConfig "tmc-agent-config" in a namespace.
# The endpoint is read from the "tmc-sm" tanzu context; its certificate is
# fetched with openssl and embedded as caCerts, and the host part of the
# endpoint becomes the single allowedHostNames entry.
# Arguments: $1 - namespace that receives the AgentConfig.
# Returns:   0 on success; 1 when the endpoint or certificate cannot be
#            obtained or when `kubectl apply` fails.
function create_agent_config() {
  local namespace="$1"
  local TMC_SM_CONTEXT="tmc-sm"
  local ENDPOINT DOMAIN CA_CERTIFICATE indented_ca

  if ! ENDPOINT=$(tanzu context get "$TMC_SM_CONTEXT" | yq .globalOpts.endpoint) \
    || [[ -z "$ENDPOINT" || "$ENDPOINT" == "null" ]]; then
    log error "Unable to read the endpoint of tanzu context '$TMC_SM_CONTEXT'"
    return 1
  fi
  # Everything before the first ':' (same result as `cut -d':' -f1`).
  DOMAIN=${ENDPOINT%%:*}

  if ! CA_CERTIFICATE=$(openssl s_client -connect "$ENDPOINT" -showcerts </dev/null 2>/dev/null | openssl x509 -outform PEM) \
    || [[ -z "$CA_CERTIFICATE" ]]; then
    log error "Unable to fetch the CA certificate from $ENDPOINT"
    return 1
  fi
  # The PEM lines form the body of a YAML block scalar, so they must be
  # indented deeper than the "caCerts" key.
  indented_ca=$(printf '%s\n' "$CA_CERTIFICATE" | sed 's/^/    /')

  if ! kubectl apply -f - <<EOF
apiVersion: "installers.tmc.cloud.vmware.com/v1alpha1"
kind: "AgentConfig"
metadata:
  name: "tmc-agent-config"
  namespace: "$namespace"
spec:
  caCerts: |-
$indented_ca
  allowedHostNames:
    - $DOMAIN
EOF
  then
    log error "Failed to apply AgentConfig tmc-agent-config in $namespace"
    return 1
  fi
  log info "Created AgentConfig tmc-agent-config in $namespace for $DOMAIN"
}
# Trigger the agent installer's UNINSTALL operation in a namespace and wait
# for it to finish.
# Arguments: $1 - namespace to clean up.
# Returns:   the status of the final `kubectl wait` (non-zero when the
#            AgentInstall object is not deleted within 3 minutes).
function uninstall_stale_res() {
  local namespace="$1"
  log info "Apply uninstall operation"
  kubectl apply -f - <<EOF
apiVersion: installers.tmc.cloud.vmware.com/v1alpha1
kind: AgentInstall
metadata:
  name: tmc-agent-installer-config
  namespace: "$namespace"
spec:
  operation: UNINSTALL
EOF
  wait_for_pods_cleaned "$namespace"
  # Wait until the agent installer config is deleted by the tmc-agent-installer
  log info "Wait until the uninstall operation is successfully completed"
  kubectl wait --for=delete agentinstall/tmc-agent-installer-config -n "$namespace" --timeout 3m
}
# Block until the namespace contains no pods other than those whose name
# starts with "tmc-agent-installer-".
# Arguments: $1 - namespace to watch.
# Polls every 5 seconds. NOTE: on timeout (5 minutes) this terminates the
# whole script with `exit 1` rather than returning.
function wait_for_pods_cleaned() {
  local namespace="$1"
  # Locals so the timing knobs and counters do not leak into the global scope.
  local TIMEOUT_SECONDS=300 # 5 minutes
  local SLEEP_INTERVAL=5
  local elapsed=0
  local pod_count
  log info "Waiting for all pods in namespace '$namespace' to terminate..."
  while true; do
    # Count the remaining pods, ignoring the installer's own pods. The pipeline
    # deliberately ends in `wc -l` (always status 0): `grep -c` would return 1
    # on a zero count and abort callers running under `set -e`.
    pod_count=$(kubectl get pods -n "$namespace" --no-headers 2>/dev/null | grep -v '^tmc-agent-installer-' | wc -l)
    # If no pods, break the loop
    if [[ "$pod_count" -eq 0 ]]; then
      log info "All pods in namespace '$namespace' are gone."
      break
    fi
    if [[ "$elapsed" -ge "$TIMEOUT_SECONDS" ]]; then
      log info "Timeout reached. Still $pod_count pod(s) remaining in namespace '$namespace'."
      exit 1
    fi
    log info "$pod_count pod(s) remaining... waited ${elapsed}s"
    sleep "$SLEEP_INTERVAL"
    ((elapsed += SLEEP_INTERVAL))
  done
}
# Poll a management cluster until its .status.health is "HEALTHY".
# Arguments: $1 - management cluster name.
# Returns:   0 once the cluster reports HEALTHY; 1 when that has not
#            happened within 600 seconds (polled every 10 seconds).
wait_for_mc_healthy() {
  local mgmt_cluster="$1"
  local INTERVAL=10
  local TIMEOUT=600
  local start_time current_time elapsed health
  log info "Waiting for the management cluster '$mgmt_cluster' to become healthy"
  start_time=$(date +%s)
  while true; do
    # The name is quoted so it always reaches the CLI as one argument; a
    # missing health field is reported as "UNKNOWN".
    health=$(tanzu tmc mc get "$mgmt_cluster" -o yaml | yq '.status.health // "UNKNOWN"')
    # Exit if the management cluster has already been healthy.
    if [[ "$health" == "HEALTHY" ]]; then
      log info "Management cluster '$mgmt_cluster' is HEALTHY"
      return 0
    fi
    current_time=$(date +%s)
    elapsed=$((current_time - start_time))
    if ((elapsed >= TIMEOUT)); then
      log info "Timeout reached after $elapsed seconds. Exiting with failure."
      return 1
    fi
    log info "Management cluster healthy status is '$health'. Waiting $INTERVAL seconds..."
    sleep "$INTERVAL"
  done
}
# Register (or re-register) every eligible management cluster listed in
# $MC_LIST_YAML_FILE and record the handled names in $REGISTERED_FILE.
# Reads:   MC_LIST_YAML_FILE, CLUSTER_DATA_DIR, MC_KUBECONFIG_INDEX_FILE.
# Writes:  REGISTERED_FILE (truncated first), one mc_<orgId>_<name>.yaml per
#          processed cluster, and the exported KUBECONFIG variable.
# Exits the script with status 1 when a cluster does not become healthy, and
# on any command failure inside the `set -e` window below.
function onboard_mgmt_clusters() {
# Reset tracking file
: >"$REGISTERED_FILE"
# Iterate through clusters
local total
total=$(yq '.managementClusters | length' $MC_LIST_YAML_FILE)
for ((index=0; index<total; index+=1)); do
local name health
name=$(yq ".managementClusters[$index].fullName.name" $MC_LIST_YAML_FILE)
health=$(yq ".managementClusters[$index].status.health" $MC_LIST_YAML_FILE)
# Only entries recorded as HEALTHY in the list file are processed, and the
# entries named "aks", "eks" and "attached" are always skipped.
if [ "$health" == "HEALTHY" ] && [[ "$name" != "aks" && "$name" != "eks" && "$name" != "attached" ]]; then
# NOTE(review): orgId and file are not declared local, so they become globals.
orgId=$(yq ".managementClusters[$index].fullName.orgId" $MC_LIST_YAML_FILE)
file="$CLUSTER_DATA_DIR/mc_${orgId}_${name}.yaml"
log info "Processing mc $name [health=$health]"
# Save cluster data without .status field.
yq "del(.managementClusters[$index].status) | .managementClusters[$index]" $MC_LIST_YAML_FILE >"$file"
# Remove orgId.
yq -i 'del(.fullName.orgId)' $file
# Look up the kubeconfig file path from the provided index file
# (second whitespace-separated field of the line starting with "<name>:").
# NOTE(review): the result is not checked; a cluster missing from the index
# file yields an empty path that is still exported and passed to the CLI.
local KUBECONFIG_PATH
KUBECONFIG_PATH=$(grep "^$name:" "$MC_KUBECONFIG_INDEX_FILE" | awk '{print $2}')
export KUBECONFIG=$KUBECONFIG_PATH
# Register the cluster using cli.
local provider proxy_name image_registry
provider=$(yq .spec.kubernetesProviderType "$file")
proxy_name=$(yq .spec.proxyName "$file")
image_registry=$(yq .spec.imageRegistry "$file")
# From here on `health` holds the live status reported by the CLI, replacing
# the value read from the list file above. Missing fields default to "UNKNOWN".
local MGMT_YAML phase health
MGMT_YAML=$(tanzu tmc mc get $name)
phase=$(echo "$MGMT_YAML" | yq '.status.phase // "UNKNOWN"')
health=$(echo "$MGMT_YAML" | yq '.status.health // "UNKNOWN"')
# Abort the whole script on the first failing command until `set +e` below.
set -e
if [[ "$health" == "HEALTHY" ]]; then
log info "Management cluster '$name' is HEALTHY, skip onboarding management cluster"
elif [[ $health == "DISCONNECTED" || "$phase" == "PENDING" ]]; then
if [[ $provider == "VMWARE_TANZU_KUBERNETES_GRID_SERVICE" ]]; then
log info "Setup configurations for management cluster $name"
# Called without a namespace, so prepare_agent_config picks the first
# svc-tmc-* namespace itself.
prepare_agent_config
fi
log info "Reregister management cluster '$name'"
tanzu tmc mc reregister "$name" --use-proxy --proxy-name "$proxy_name" --image-registry "$image_registry" --kubeconfig "$KUBECONFIG_PATH"
else
# Any other state: start from a clean registration.
if [[ "$phase" == "READY_TO_ATTACH" ]]; then
log info "Deregister management cluster '$name'"
tanzu tmc mc deregister "$name" --kubeconfig "$KUBECONFIG_PATH" --force
fi
if [[ $provider == "VMWARE_TANZU_KUBERNETES_GRID_SERVICE" ]]; then
# Prepare.
log info "Setup configurations for management cluster $name"
prepare_for_vks
fi
log info "Register management cluster '$name'"
tanzu tmc mc register "$name" -f "$file" --kubeconfig "$KUBECONFIG_PATH"
fi
set +e
# `health` still holds the status captured before the (re)registration, so
# this waits for every cluster that was not already healthy.
if [[ "$health" != "HEALTHY" ]]; then
# Wait for the registered MC is healthy.
if ! wait_for_mc_healthy "$name"; then
log error "Management cluster '$name' is not ready, exit ..."
exit 1
fi
fi
# Track the name of successfully registered management cluster for later use.
# The exact whole-line match keeps the file free of duplicates.
if ! grep -qxF "$name" "$REGISTERED_FILE"; then
echo "$name" >>"$REGISTERED_FILE"
fi
fi
done
}
# Normalize a version string for comparison: drop one leading "v" and
# everything from the first "+" onwards (e.g. v1.29.4+vmware.1 -> 1.29.4).
# Arguments: $1 - raw version string.
# Outputs:   the cleaned version on stdout.
sanitize_version() {
  sed -E -e 's/^v//' -e 's/[+].*$//' <<<"$1"
}
# Version-aware ">=" test.
# Arguments: $1 - version under test, $2 - minimum version.
# Returns:   0 if $1 >= $2, non-zero otherwise.
compare_versions() {
  local lowest
  # With version sort, $2 comes out first exactly when it is not greater than $1.
  lowest=$(printf "%s\n%s" "$2" "$1" | sort -V | head -n1)
  [[ "$lowest" == "$2" ]]
}
# Bring one workload cluster under management and record it in the index file.
# Arguments: $1 - YAML file with a .clusters[] list
#            $2 - index of the cluster inside .clusters[]
# Globals:   MIN_VERSION (read), ONBOARDED_CLUSTER_INDEX_FILE (appended to)
# Returns:   0 when the cluster was onboarded or skipped because its version
#            is below MIN_VERSION; 1 when the manage command fails or the
#            cluster does not become ready.
onboard_workload_cluster() {
  local wc_file="$1"
  local i="$2"
  local name mgmt prov group proxy registry
  name=$(yq eval ".clusters[$i].fullName.name" "$wc_file")
  mgmt=$(yq eval ".clusters[$i].fullName.managementClusterName" "$wc_file")
  prov=$(yq eval ".clusters[$i].fullName.provisionerName" "$wc_file")
  group=$(yq eval ".clusters[$i].spec.clusterGroupName" "$wc_file")
  proxy=$(yq eval ".clusters[$i].spec.proxyName // \"\"" "$wc_file")
  registry=$(yq eval ".clusters[$i].spec.imageRegistry // \"\"" "$wc_file")

  local version clean_version
  version=$(yq eval ".clusters[$i].spec.topology.version" "$wc_file")
  clean_version=$(sanitize_version "$version")
  if ! compare_versions "$clean_version" "$MIN_VERSION"; then
    log warn "Skipping: the version '$version' of cluster '$name' in namespace '$prov' is NOT supported (< v$MIN_VERSION)"
    return 0
  fi

  # Build the command as an array and run it directly. Values read from the
  # YAML are passed verbatim as single arguments and are never re-parsed by
  # the shell (the previous string + eval form expanded them a second time).
  local -a cmd=(tanzu tmc mc wc manage "$name" -m "$mgmt" -p "$prov" --cluster-group "$group")
  if [[ -n "$proxy" ]]; then
    cmd+=(--proxy-name "$proxy")
  fi
  if [[ -n "$registry" ]]; then
    cmd+=(--image-registry "$registry")
  fi
  log info "Run ${cmd[*]}"

  if ! "${cmd[@]}"; then
    log error "❌ Failed to manage $name, please re-run this script later (required)"
    return 1
  fi
  if ! wait_cluster_ready "$mgmt" "$prov" "$name"; then
    log error "❌ Cluster $name is not ready, please double check it in the TMC-SM and ensure it's ready and then re-run this script again (required)"
    return 1
  fi

  # Record the cluster once; the exact whole-line match avoids duplicates.
  local onboarded_cluster_name="$mgmt.$prov.$name"
  if ! grep -qxF -- "$onboarded_cluster_name" "$ONBOARDED_CLUSTER_INDEX_FILE"; then
    # If it doesn't exist, append it
    echo "$onboarded_cluster_name" >>"$ONBOARDED_CLUSTER_INDEX_FILE"
  fi
  return 0
}
# Onboard the workload clusters of every management cluster listed in
# $REGISTERED_FILE, running onboard_workload_cluster in parallel batches.
# Globals: CLUSTERS_ONBOARD_BATCH_SIZE (read), CLUSTER_DATA_DIR (read),
#          REGISTERED_FILE (read), ONBOARDED_CLUSTER_INDEX_FILE (created if missing).
# Failed jobs are counted and reported per batch; they do not stop the run.
function onboard_workload_clusters () {
  local mc_name wc_file cluster_count batch_num start end i pid failed_jobs
  local -a pids

  # A batch size of 0 (or any non-numeric value) would keep the batch loop
  # from ever advancing, so fall back to sequential processing.
  local batch_size="$CLUSTERS_ONBOARD_BATCH_SIZE"
  if [[ ! "$batch_size" =~ ^[1-9][0-9]*$ ]]; then
    log warn "Invalid CLUSTERS_ONBOARD_BATCH_SIZE '$CLUSTERS_ONBOARD_BATCH_SIZE', falling back to 1"
    batch_size=1
  fi

  # Ensure the index file exists before grepping
  [ -f "$ONBOARDED_CLUSTER_INDEX_FILE" ] || touch "$ONBOARDED_CLUSTER_INDEX_FILE"

  # Read management cluster name from $REGISTERED_FILE.
  while IFS= read -r mc_name; do
    wc_file="$CLUSTER_DATA_DIR/wc_of_${mc_name}.yaml"
    if [[ ! -f "$wc_file" ]]; then
      log warn "File $wc_file not found. Skipping..."
      continue
    fi
    # Remove orgId first
    yq -i '(.clusters[] | .fullName) |= del(.orgId)' "$wc_file"
    # Count how many workload clusters exist in the file
    cluster_count=$(yq eval '.clusters | length' "$wc_file")
    if [[ ! "$cluster_count" =~ ^[0-9]+$ ]]; then
      log warn "Unable to determine the number of clusters in $wc_file. Skipping..."
      continue
    fi
    log info "Processing $cluster_count clusters in parallel batches of $batch_size"

    batch_num=0
    for ((start = 0; start < cluster_count; start += batch_size)); do
      batch_num=$((batch_num + 1))
      # Last index of this batch, clamped to the last cluster.
      end=$((start + batch_size - 1))
      if ((end >= cluster_count)); then
        end=$((cluster_count - 1))
      fi
      log info "Processing batch $batch_num: clusters $((start + 1))-$((end + 1)) of $cluster_count"

      pids=()
      for ((i = start; i <= end; i++)); do
        # Start the cluster processing in background
        onboard_workload_cluster "$wc_file" "$i" &
        pids+=("$!")
      done

      # Barrier: wait for every job of the batch and count the failures.
      failed_jobs=0
      for pid in "${pids[@]}"; do
        if ! wait "$pid"; then
          failed_jobs=$((failed_jobs + 1))
        fi
      done
      if ((failed_jobs > 0)); then
        log warn "Batch $batch_num completed with $failed_jobs failed jobs out of $((end - start + 1)) total jobs"
      else
        log info "Batch $batch_num completed successfully"
      fi
    done
    log info "All clusters processing completed for management cluster: $mc_name"
  done <"$REGISTERED_FILE"
  log info "All management clusters processing completed"
}
# Entry point. Runs the three phases strictly in order:
#   1. init                      - pre-flight check of the kubeconfig index file
#   2. onboard_mgmt_clusters     - register the management clusters
#   3. onboard_workload_clusters - manage their workload clusters
function main() {
  local phase
  for phase in init onboard_mgmt_clusters onboard_workload_clusters; do
    "$phase"
  done
}
main