57 changes: 49 additions & 8 deletions src/bosh-director/lib/bosh/director/metrics_collector.rb
@@ -42,6 +42,33 @@ def initialize(config)
labels: %i[name],
docstring: 'Number of unresponsive agents per deployment',
)

@unhealthy_agents = Prometheus::Client.registry.gauge(
:bosh_unhealthy_agents,
labels: %i[name],
docstring: 'Number of unhealthy agents (job_state == running AND number_of_processes == 0) per deployment',
)
@total_available_agents = Prometheus::Client.registry.gauge(
:bosh_total_available_agents,
labels: %i[name],
docstring: 'Number of total available agents (all agents, no criteria) per deployment',
)
@failing_instances = Prometheus::Client.registry.gauge(
:bosh_failing_instances,
labels: %i[name],
docstring: 'Number of failing instances (job_state == "failing") per deployment',
)
@stopped_instances = Prometheus::Client.registry.gauge(
:bosh_stopped_instances,
labels: %i[name],
docstring: 'Number of instances (job_state == "stopped") per deployment',
)

@unknown_instances = Prometheus::Client.registry.gauge(
:bosh_unknown_instances,
labels: %i[name],
docstring: 'Number of instances with unknown job_state per deployment',
)
@scheduler = Rufus::Scheduler.new
end

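For reference, a minimal sketch (not part of this diff) of how one of the gauges registered above is written and read back through the prometheus-client API used in this file; the metric name here is hypothetical so it does not clash with the real registration:

require 'prometheus/client'

# Register a gauge the same way initialize does, then set and read a
# per-deployment value via the :name label.
gauge = Prometheus::Client.registry.gauge(
  :bosh_unhealthy_agents_example, # hypothetical; the real metric is :bosh_unhealthy_agents
  labels: %i[name],
  docstring: 'Example gauge for illustration',
)
gauge.set(2, labels: { name: 'flaky_deployment' })
gauge.get(labels: { name: 'flaky_deployment' }) # => 2.0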
@@ -115,26 +142,40 @@ def populate_metrics
end

def populate_vm_metrics
response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port)
fetch_and_update_gauge('/unresponsive_agents', @unresponsive_agents)
fetch_and_update_gauge('/unhealthy_agents', @unhealthy_agents)
fetch_and_update_gauge('/total_available_agents', @total_available_agents)
fetch_and_update_gauge('/failing_instances', @failing_instances)
fetch_and_update_gauge('/stopped_instances', @stopped_instances)
fetch_and_update_gauge('/unknown_instances', @unknown_instances)
end

def fetch_and_update_gauge(endpoint, gauge)
response = Net::HTTP.get_response('127.0.0.1', endpoint, @config.health_monitor_port)
return unless response.is_a?(Net::HTTPSuccess)

unresponsive_agent_counts = JSON.parse(response.body)
return unless unresponsive_agent_counts.is_a?(Hash)
begin
deployment_counts = JSON.parse(response.body)
rescue JSON::ParserError => e
@logger.warn("Failed to parse JSON response from #{endpoint}: #{e.message}")
return
end
return unless deployment_counts.is_a?(Hash)

existing_deployment_names = @unresponsive_agents.values.map do |key, _|
existing_deployment_names = gauge.values.map do |key, _|
# The keys within the Prometheus::Client::Metric#values method are actually hashes. So the
# data returned from values looks like:
# { { name: "deployment_a"} => 10, { name: "deployment_b "} => 0, ... }
key[:name]
end

unresponsive_agent_counts.each do |deployment, count|
@unresponsive_agents.set(count, labels: { name: deployment })
deployment_counts.each do |deployment, count|
gauge.set(count, labels: { name: deployment })
end

removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys
removed_deployments = existing_deployment_names - deployment_counts.keys
removed_deployments.each do |deployment|
@unresponsive_agents.set(0, labels: { name: deployment })
gauge.set(0, labels: { name: deployment })
end
end

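A small standalone sketch of the bookkeeping fetch_and_update_gauge performs when a deployment disappears from the health monitor payload; the sample data is made up, and the keys of Gauge#values are label hashes, as the comment above notes:

require 'prometheus/client'

gauge = Prometheus::Client.registry.gauge(:bosh_agents_example, labels: %i[name], docstring: 'example')
gauge.set(1, labels: { name: 'flaky_deployment' })
gauge.set(0, labels: { name: 'good_deployment' })

latest_counts = { 'good_deployment' => 0 } # flaky_deployment was deleted since the last poll
known_names = gauge.values.map { |key, _| key[:name] }
(known_names - latest_counts.keys).each { |gone| gauge.set(0, labels: { name: gone }) }

gauge.get(labels: { name: 'flaky_deployment' }) # => 0.0 rather than a stale 1.0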
@@ -34,7 +34,17 @@ def tick
allow(resurrector_manager).to receive(:pause_for_all?).and_return(false, true, false)
allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
stub_request(:get, /unhealthy_agents/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0))
stub_request(:get, /total_available_agents/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 3, "good_deployment" => 2))
stub_request(:get, /failing_instances/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
stub_request(:get, /stopped_instances/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 0))
stub_request(:get, /unknown_instances/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 1))
end

after do
@@ -44,6 +54,11 @@ def tick
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
Prometheus::Client.registry.unregister(:bosh_unhealthy_agents)
Prometheus::Client.registry.unregister(:bosh_total_available_agents)
Prometheus::Client.registry.unregister(:bosh_failing_instances)
Prometheus::Client.registry.unregister(:bosh_stopped_instances)
Prometheus::Client.registry.unregister(:bosh_unknown_instances)
end

describe '#prep' do
@@ -293,46 +308,96 @@ def tick
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

context 'when the health monitor returns a non 200 response' do
it "emits the number of unhealthy agents for each deployment" do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.get(labels: { name: "flaky_deployment" })).to eq(2)
expect(metric.get(labels: { name: "good_deployment" })).to eq(0)
end

it 'emits the number of total available agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_total_available_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(3)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(2)
end

it 'emits the failing/stopped/unknown instance metrics for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_failing_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)

metric = Prometheus::Client.registry.get(:bosh_stopped_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)

metric = Prometheus::Client.registry.get(:bosh_unknown_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(1)
end

context "when the health monitor returns a non 200 response" do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
stub_request(:get, "127.0.0.1:12345/unresponsive_agents")
.to_return(status: 404)
stub_request(:get, "127.0.0.1:12345/unhealthy_agents")
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.values).to be_empty
end
end

context 'when the health monitor returns a non-json response' do
let(:logger) { double(Logging::Logger) }

before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
allow(config).to receive(:metrics_server_logger).and_return(logger)
allow(logger).to receive(:info)
stub_request(:get, "127.0.0.1:12345/unresponsive_agents")
.to_return(status: 200, body: "not valid json {")
stub_request(:get, "127.0.0.1:12345/unhealthy_agents")
.to_return(status: 200, body: "not valid json {")
end

it 'does not emit the vm metrics' do
it 'does not emit the vm metrics and logs a warning' do
expect(logger).to receive(:warn).with(/Failed to parse JSON response from \/unresponsive_agents/)
expect(logger).to receive(:warn).with(/Failed to parse JSON response from \/unhealthy_agents/)

metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.values).to be_empty
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
stub_request(:get, /unhealthy_agents/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0))
metrics_collector.start

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
.to_return(status: 200, body: JSON.dump("good_deployment" => 0))
stub_request(:get, /unhealthy_agents/)
.to_return(status: 200, body: JSON.dump("good_deployment" => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0)
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0)
end
end
end
2 changes: 2 additions & 0 deletions src/bosh-monitor/lib/bosh/monitor/agent.rb
@@ -3,6 +3,8 @@ class Agent
attr_reader :id
attr_reader :discovered_at
attr_accessor :updated_at
attr_accessor :job_state
attr_accessor :number_of_processes

ATTRIBUTES = %i[deployment job index instance_id cid].freeze

40 changes: 40 additions & 0 deletions src/bosh-monitor/lib/bosh/monitor/api_controller.rb
@@ -40,5 +40,45 @@ def initialize(heartbeat_interval = 1)
status(503)
end
end

get '/unhealthy_agents' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unhealthy_agents)
else
status(503)
end
end

get '/total_available_agents' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.total_available_agents)
else
status(503)
end
end

get '/failing_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.failing_instances)
else
status(503)
end
end

get '/stopped_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.stopped_instances)
else
status(503)
end
end

get '/unknown_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unknown_instances)
else
status(503)
end
end
end
end
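A rough usage sketch for the new endpoints, mirroring how the director's fetch_and_update_gauge polls them; the port 12345 is the value stubbed in the director spec, while the real value comes from the director's health_monitor_port configuration:

require 'net/http'
require 'json'

%w[/unhealthy_agents /total_available_agents /failing_instances
   /stopped_instances /unknown_instances].each do |endpoint|
  response = Net::HTTP.get_response('127.0.0.1', endpoint, 12345)
  # Each endpoint returns 503 until the initial director deployment sync is done.
  next unless response.is_a?(Net::HTTPSuccess)

  counts = JSON.parse(response.body) # e.g. {"flaky_deployment" => 2, "good_deployment" => 0}
  puts "#{endpoint}: #{counts}"
end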
6 changes: 5 additions & 1 deletion src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb
@@ -65,7 +65,7 @@ def to_s
end

def to_hash
{
result = {
kind: 'heartbeat',
id: @id,
timestamp: @timestamp.to_i,
@@ -79,6 +79,10 @@ def to_hash
teams: @teams,
metrics: @metrics.map(&:to_hash),
}
# Include number_of_processes if present in attributes
result[:number_of_processes] = @attributes["number_of_processes"] if @attributes.key?("number_of_processes")

result
end

def to_json(*_args)
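Illustrative only: the shape a heartbeat hash could take once an agent reports number_of_processes; field values are invented, and attributes elided in this diff are omitted:

{
  kind: 'heartbeat',
  id: 'hb-1',
  timestamp: 1_700_000_000,
  # ... other heartbeat attributes not shown in this diff ...
  teams: [],
  metrics: [],
  number_of_processes: 0, # appended only when the agent reported the attribute
}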
56 changes: 55 additions & 1 deletion src/bosh-monitor/lib/bosh/monitor/instance_manager.rb
@@ -121,6 +121,56 @@ def unresponsive_agents
agents_hash
end

def unhealthy_agents
agents_hash = {}
@deployment_name_to_deployments.each do |name, deployment|
agents_hash[name] = deployment.agents.count do |agent|
agent.job_state && agent.job_state == "running" && agent.number_of_processes == 0
Review comment (Copilot AI, Dec 18, 2025): The unhealthy_agents method has redundant logic. Because agent.job_state == "running" already fails for a nil job_state, the leading agent.job_state && guard is unnecessary.

Suggested change (replace the first line with the second):
agent.job_state && agent.job_state == "running" && agent.number_of_processes == 0
agent.job_state == "running" && agent.number_of_processes == 0
end
end

agents_hash
end
Review comment (Copilot AI, Dec 18, 2025): The unhealthy_agents method is missing a blank line after its definition; the other metric methods in this file are separated by blank lines for consistency. Suggested change: add a blank line after the closing end of unhealthy_agents.
def failing_instances
agents_hash = {}
@deployment_name_to_deployments.each do |name, deployment|
agents_hash[name] = deployment.agents.count { |agent| agent.job_state == "failing" }
end

agents_hash
end

def stopped_instances
agents_hash = {}
@deployment_name_to_deployments.each do |name, deployment|
agents_hash[name] = deployment.agents.count { |agent| agent.job_state == "stopped" }
end

agents_hash
end

def unknown_instances
agents_hash = {}
@deployment_name_to_deployments.each do |name, deployment|
agents_hash[name] = deployment.agents.count { |agent| agent.job_state.nil? }
end

agents_hash
end

def total_available_agents
agents_hash = {}
@deployment_name_to_deployments.each do |name, deployment|
# Count all agents for the deployment (no additional criteria)
agents_hash[name] = deployment.agents.count
end

# Include unmanaged (rogue) agents in the aggregate under 'unmanaged'
agents_hash['unmanaged'] = @unmanaged_agents.keys.size

agents_hash
end

def analyze_agents
@logger.info('Analyzing agents...')
started = Time.now
@@ -325,7 +375,11 @@ def on_heartbeat(agent, deployment, message)
message['instance_id'] = agent.instance_id
message['teams'] = deployment ? deployment.teams : []

return if message['instance_id'].nil? || message['job'].nil? || message['deployment'].nil?
# Store job_state and number_of_processes on the agent for unhealthy detection
agent.job_state = message["job_state"]
agent.number_of_processes = message["number_of_processes"]

return if message["instance_id"].nil? || message["job"].nil? || message["deployment"].nil?
end

@processor.process(:heartbeat, message)
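A minimal, self-contained sketch of the counting rules added above, using a stand-in Struct instead of the real Bosh::Monitor::Agent (only the two attributes this PR adds are needed):

AgentStub = Struct.new(:job_state, :number_of_processes)

agents = [
  AgentStub.new('running', 0), # running with zero processes -> unhealthy
  AgentStub.new('running', 3), # healthy
  AgentStub.new('failing', 2), # counted by failing_instances
  AgentStub.new(nil, nil),     # counted by unknown_instances
]

unhealthy = agents.count { |a| a.job_state == 'running' && a.number_of_processes == 0 }
failing   = agents.count { |a| a.job_state == 'failing' }
unknown   = agents.count { |a| a.job_state.nil? }
puts [unhealthy, failing, unknown].inspect # => [1, 1, 1]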
11 changes: 11 additions & 0 deletions src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb
@@ -66,6 +66,17 @@ def make_agent(id)
expect(agent.cid).to eq(instance.cid)
expect(agent.instance_id).to eq(instance.id)
end

it "does not modify job_state or number_of_processes when updating instance" do
agent = make_agent("agent_with_instance")
agent.job_state = "running"
agent.number_of_processes = 3

agent.update_instance(instance)

expect(agent.job_state).to eq("running")
expect(agent.number_of_processes).to eq(3)
end
end
end
end