Commit 1547da4

Add watch-pcs-and-etcd helper for monitoring cluster status
The script continuously monitors both the Pacemaker (PCS) status and the etcd member list, automatically switching to another cluster node when the current one stops responding. Nodes are discovered from an Ansible inventory file or, failing that, via virsh.
1 parent 4b670e8 commit 1547da4
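
A minimal usage sketch, assuming the helper is run from the repository root and that any extra arguments are passed through to "etcdctl member list" (for example etcdctl's -w table output flag):

    ./helpers/watch-pcs-and-etcd.sh             # default member list output
    ./helpers/watch-pcs-and-etcd.sh -w table    # tabular etcdctl output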

File tree

1 file changed: +198 -0 lines changed

helpers/watch-pcs-and-etcd.sh

Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,198 @@
#!/usr/bin/env bash
# Continuously display the PCS status and the Etcd member list by switching between available nodes.
# Assumes you have passwordless sudo configured for the user running this script
# or are running as root.

trap 'echo "Interrupted. Exiting..."; exit 0' SIGINT

# Node discovery functions
get_nodes_from_inventory() {
    local inventory_file="inventory.ini"
    if [ ! -f "$inventory_file" ]; then
        return 1
    fi

    # Parse ansible inventory for nodes in cluster_nodes section
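    # Hypothetical example of the inventory.ini layout this expects
    # (both line styles handled below are shown):
    #   [cluster_nodes]
    #   core@192.168.111.20 ansible_ssh_extra_args=...
    #   192.168.111.21 ansible_user=core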
    awk '
        /^\[cluster_nodes\]$/ { in_section=1; next }
        /^\[/ { in_section=0 }
        in_section && /^core@[0-9]/ {
            # Extract core@IP from lines like: core@192.168.111.20 ansible_ssh_extra_args=...
            split($1, parts, " ")
            print parts[1]
        }
        in_section && /^[0-9]/ {
            # Handle lines like: 192.168.111.20 ansible_user=core
            split($1, ip, " ")
            print "core@" ip[1]
        }
    ' "$inventory_file"
}

get_nodes_from_virsh() {
    # Get VM IPs from virsh - dynamically discover running VMs
    local nodes=()

    # Get list of running VMs
    local vms=()
    mapfile -t vms < <(virsh list --state-running --name 2>/dev/null)
    # mapfile succeeds even if virsh fails, so check that we actually got VMs
    if [ ${#vms[@]} -eq 0 ]; then
        return 1
    fi

    # For each running VM, try to get its IP
    for vm in "${vms[@]}"; do
        [ -n "$vm" ] || continue # Skip empty lines

        local ip
        local vm_ip_found=false

        # First try domifaddr (works with default network DHCP)
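        # Illustrative `virsh domifaddr` output the pipeline below parses:
        #   Name       MAC address          Protocol     Address
        #   vnet0      52:54:00:aa:bb:cc    ipv4         192.168.122.15/24
        # awk keeps the ipv4 line and cut drops the "/prefix" suffix.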
        if ip=$(virsh domifaddr "$vm" 2>/dev/null | awk '/ipv4/ {print $4}' | cut -d'/' -f1 | head -1); then
            if [ -n "$ip" ]; then
                nodes+=("core@$ip")
                vm_ip_found=true
            fi
        fi

        # If domifaddr fails or no IP found yet, try ARP for custom bridge networks
        if [ "$vm_ip_found" = false ]; then
            # Get MAC addresses for the tnfbm bridge network, which dev-scripts usually creates
            if ! mac=$(virsh domiflist "$vm" 2>/dev/null | awk '/tnfbm/ {print $5}'); then
                # Could not get interface; try the next VM
                continue
            elif [ -z "$mac" ]; then
                # Could not get MAC address; try the next VM
                continue
            else
                # Look up IP via ARP table using MAC address
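                # Typical `arp -a` entry (illustrative): host.example (192.168.111.21) at 52:54:00:aa:bb:cc [ether] on tnfbm
                # so field 2 is "(IP)" and tr strips the parentheses.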
                if ip=$(arp -a | grep -i "$mac" | awk '{print $2}' | tr -d '()' | head -1); then
                    if [ -n "$ip" ] && [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
                        # Validate that this IP is actually reachable and belongs to a CoreOS node
                        if timeout 3 ssh -o ConnectTimeout=2 -o BatchMode=yes "core@$ip" -- echo "ping" >/dev/null 2>&1; then
                            nodes+=("core@$ip")
                        fi
                    fi
                fi
            fi
        fi
    done

    if [ ${#nodes[@]} -gt 0 ]; then
        printf '%s\n' "${nodes[@]}"
    else
        return 1
    fi
}

discover_nodes() {
    # Try the inventory file first, then virsh; give up if neither works
    if get_nodes_from_inventory 2>/dev/null; then
        echo "# Using nodes from inventory.ini" >&2
    elif get_nodes_from_virsh 2>/dev/null; then
        echo "# Using nodes discovered via virsh" >&2
    else
        echo "! Could not discover nodes" >&2
        exit 1
    fi
}

# Initialize nodes array
echo "Discovering cluster nodes..."
mapfile -t NODES < <(discover_nodes)
echo "Found ${#NODES[@]} nodes: ${NODES[*]}"
working_node="${NODES[0]}"

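# Round-robin node selection. Illustrative behaviour, assuming
# NODES=(core@a core@b): get_other_node core@a -> core@b, core@b -> core@a.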
get_other_node() {
    local current=$1
    for i in "${!NODES[@]}"; do
        if [ "${NODES[i]}" = "$current" ]; then
            local next_index=$(( (i + 1) % ${#NODES[@]} ))
            echo "${NODES[next_index]}"
            return 0
        fi
    done
    # Fallback if not found
    echo "${NODES[0]}"
}

test_etcdctl_command() {
    local node=$1
    local timeout=3

    # Test if etcdctl command works on this node
    timeout "$timeout" ssh -o ConnectTimeout=2 "$node" -- \
        sudo podman exec etcd etcdctl member list --command-timeout=1s >/dev/null 2>&1
    return $?
}

watch_cluster() {
    local retry_count=0
    local max_retries=6
    local max_failures=3

    while true; do
        echo "Getting Etcd member list from $working_node..."

        # We try to use the node where etcdctl is able to respond
        if test_etcdctl_command "$working_node"; then
            echo "Starting watch on $working_node (press Ctrl+C to exit)..."

            # Cannot use `watch`: it would not detect some command failures and would
            # leave us stuck instead of moving on to the other node.
            while true; do
                # Capture output first to minimize screen flashing
                local output_etcd
                local header_etcd
                local output_pcs
                local header_pcs
                local separator="========================================"

                header_etcd="Etcd member list from $working_node ($(date)):"
                header_pcs="Pacemaker status from $working_node ($(date)):"

                if ! output_pcs=$(ssh -o ConnectTimeout=3 "$working_node" -- sudo pcs status 2>/dev/null); then
                    clear
                    echo "$header_pcs Command failed on $working_node, switching nodes..."
                    break
                fi

                if ! output_etcd=$(ssh -o ConnectTimeout=3 "$working_node" -- \
                    sudo podman exec etcd etcdctl member list "$@" --command-timeout=2s 2>/dev/null); then
                    clear
                    echo "$header_etcd Command failed on $working_node, switching nodes..."
                    break
                fi
                # Success - quickly clear and display
                clear

                echo -e "\n$header_pcs"
                echo "$separator"
                echo -e "$output_pcs\n"

                echo -e "\n$header_etcd"
                echo "$separator"
                echo "$output_etcd"

                sleep 5
            done
        else
            echo "Command test failed on $working_node, switching to other node..."
        fi

        # Switch to other node
        working_node=$(get_other_node "$working_node")
        retry_count=$((retry_count + 1))

        if [ $retry_count -ge $max_retries ]; then
            echo "All nodes failed after $max_retries attempts. Waiting 10 seconds before retrying..."
            sleep 10
            retry_count=0
        else
            sleep 1
        fi
    done
}

# Run the watch function; extra script arguments are forwarded to `etcdctl member list`
watch_cluster "$@"
