@@ -37,6 +37,9 @@ export function useAgentHealth(options: HealthMonitorOptions = {}) {
3737 const healthMetricsRef = useRef ( healthMetrics ) ;
3838 const onStatusChangeRef = useRef ( onStatusChange ) ;
3939
40+ // RACE CONDITION FIX: Track the IDs of agents whose health check is currently in flight, so a second check for the same agent is not started
41+ const ongoingChecksRef = useRef < Set < string > > ( new Set ( ) ) ;
42+
4043 // PERFORMANCE FIX: Pause polling when tab is not visible
4144 useEffect ( ( ) => {
4245 const handleVisibilityChange = ( ) => {
@@ -69,6 +72,28 @@ export function useAgentHealth(options: HealthMonitorOptions = {}) {
6972 } , [ ] ) ;
7073
7174 const checkSingleAgent = useCallback ( async ( agent : Agent ) : Promise < AgentHealthMetrics > => {
75+ // RACE CONDITION FIX: Skip if already checking this agent
76+ if ( ongoingChecksRef . current . has ( agent . id ) ) {
77+ // Prefer the last known metrics for this agent, if any have been recorded
78+ const cached = healthMetricsRef . current [ agent . id ] ;
79+ if ( cached ) {
80+ return cached ;
81+ }
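82+ // No cached metrics yet: return a placeholder that reports the agent as offline (with an explanatory error) while the other check is still in flight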
83+ return {
84+ agentId : agent . id ,
85+ isOnline : false ,
86+ responseTime : 0 ,
87+ lastChecked : new Date ( ) ,
88+ consecutiveFailures : 0 ,
89+ uptime : 0 ,
90+ error : 'Check already in progress' ,
91+ } ;
92+ }
93+
94+ // Mark this agent as being checked
95+ ongoingChecksRef . current . add ( agent . id ) ;
96+
7297 const startTime = Date . now ( ) ;
7398 let isOnline = false ;
7499 let error : string | undefined ;
@@ -77,6 +102,9 @@ export function useAgentHealth(options: HealthMonitorOptions = {}) {
77102 isOnline = await checkAgentStatus ( agent . id ) ;
78103 } catch ( err ) {
79104 error = err instanceof Error ? err . message : 'Unknown error' ;
105+ } finally {
106+ // CLEANUP: Always remove from ongoing checks
107+ ongoingChecksRef . current . delete ( agent . id ) ;
80108 }
81109
82110 const responseTime = Date . now ( ) - startTime ;
@@ -93,6 +121,7 @@ export function useAgentHealth(options: HealthMonitorOptions = {}) {
93121 } ;
94122 } , [ checkAgentStatus , calculateUptime ] ) ;
95123
124+ // REFACTORED: Use Promise.allSettled to prevent crashes from single agent failures
96125 const checkAllAgents = useCallback ( async ( ) => {
97126 if ( agents . length === 0 ) {
98127 setIsMonitoring ( false ) ;
@@ -101,18 +130,41 @@ export function useAgentHealth(options: HealthMonitorOptions = {}) {
101130
102131 setIsMonitoring ( true ) ;
103132
104- const results = await Promise . all (
133+ // CRITICAL FIX: Use Promise.allSettled instead of Promise.all
134+ // This prevents the entire health check from failing if one agent fails
135+ const results = await Promise . allSettled (
105136 agents . map ( agent => checkSingleAgent ( agent ) )
106137 ) ;
107138
108139 const newMetrics : Record < string , AgentHealthMetrics > = { } ;
109- results . forEach ( metric => {
110- newMetrics [ metric . agentId ] = metric ;
111140
112- // Trigger status change callback if status changed
113- const previousStatus = healthMetricsRef . current [ metric . agentId ] ?. isOnline ;
114- if ( previousStatus !== undefined && previousStatus !== metric . isOnline && onStatusChangeRef . current ) {
115- onStatusChangeRef . current ( metric . agentId , metric . isOnline ) ;
141+ results . forEach ( ( result , index ) => {
142+ if ( result . status === 'fulfilled' ) {
143+ const metric = result . value ;
144+ newMetrics [ metric . agentId ] = metric ;
145+
146+ // Trigger status change callback if status changed
147+ const previousStatus = healthMetricsRef . current [ metric . agentId ] ?. isOnline ;
148+ if ( previousStatus !== undefined && previousStatus !== metric . isOnline && onStatusChangeRef . current ) {
149+ onStatusChangeRef . current ( metric . agentId , metric . isOnline ) ;
150+ }
151+ } else {
152+ // IMPROVED: Handle rejected promises gracefully
153+ const agent = agents [ index ] ;
154+ console . error ( `Health check failed for agent ${ agent ?. id } :` , result . reason ) ;
155+
156+ // Create a failure metric for the agent
157+ if ( agent ) {
158+ newMetrics [ agent . id ] = {
159+ agentId : agent . id ,
160+ isOnline : false ,
161+ responseTime : 0 ,
162+ lastChecked : new Date ( ) ,
163+ consecutiveFailures : ( healthMetricsRef . current [ agent . id ] ?. consecutiveFailures || 0 ) + 1 ,
164+ uptime : 0 ,
165+ error : result . reason instanceof Error ? result . reason . message : 'Health check failed' ,
166+ } ;
167+ }
116168 }
117169 } ) ;
118170
0 commit comments