From 8b32456e6f5d02519996fef1f3c45d7fb9bb390c Mon Sep 17 00:00:00 2001
From: Mike Schennum
Date: Wed, 17 Sep 2025 09:58:51 -0700
Subject: [PATCH 1/5] increase memory to node

---
 Dockerfile    |  2 +-
 src/main.ts   | 30 ++++++++++++++++++++++++++++--
 src/server.ts | 19 +++++++++++++++++--
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a9e6dc79..a8d3ad61 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,4 +13,4 @@ COPY . *.env ./
 RUN yarn install --no-progress && \
   yarn build-release
 
-CMD node --experimental-json-modules build/main.js
+CMD node --max-old-space-size=512 --experimental-json-modules build/main.js
diff --git a/src/main.ts b/src/main.ts
index 70b015df..58a6c536 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,5 +1,31 @@
 import { connectDB, defaultPostConnect } from './db/index.js'
 import { createServer } from './server.js'
 
-await connectDB(defaultPostConnect)
-await createServer()
+process.on('uncaughtException', (error) => {
+  console.error('Uncaught Exception:', error)
+  process.exit(1)
+})
+
+process.on('unhandledRejection', (reason) => {
+  console.error('Unhandled Rejection:', reason)
+  process.exit(1)
+})
+
+process.on('SIGTERM', () => {
+  console.log('SIGTERM received, shutting down gracefully')
+  process.exit(0)
+})
+
+process.on('SIGINT', () => {
+  console.log('SIGINT received, shutting down gracefully')
+  process.exit(0)
+})
+
+try {
+  await connectDB(defaultPostConnect)
+  await createServer()
+  console.log('🚀 Server started successfully')
+} catch (error) {
+  console.error('Failed to start server:', error)
+  process.exit(1)
+}
diff --git a/src/server.ts b/src/server.ts
index 6eab9995..d6543edf 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -49,7 +49,8 @@ export async function createServer (): Promise<{ app: express.Application, serve
     schema,
     plugins: [ApolloServerPluginDrainHttpServer({ httpServer })],
     cache: new InMemoryLRUCache({
-      max: 100
+      max: 50,
+      maxSize: 1024 * 1024 * 10
     })
   })
   // server must be started before applying middleware
@@ -57,8 +58,22 @@
   await server.start()
   const context = process.env.LOCAL_DEV_BYPASS_AUTH === 'true' ? localDevBypassAuthContext : createContext
 
+  app.get('/health', (req, res) => {
+    const memUsage = process.memoryUsage()
+    res.json({
+      status: 'ok',
+      timestamp: new Date().toISOString(),
+      memory: {
+        rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`,
+        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
+        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
+        external: `${Math.round(memUsage.external / 1024 / 1024)}MB`
+      }
+    })
+  })
+
   app.use('/',
-    bodyParser.json({ limit: '10mb' }),
+    bodyParser.json({ limit: '5mb' }),
     cors(),
     express.json(),
     expressMiddleware(server, {

From ef6809a78eedcb3232c281dedfdf88b5934df922 Mon Sep 17 00:00:00 2001
From: Mike Schennum
Date: Wed, 17 Sep 2025 10:18:16 -0700
Subject: [PATCH 2/5] add logging for prod crashes

---
 Dockerfile                                |   2 +-
 src/db/index.ts                           |  11 +-
 src/db/utils/jobs/migration/SirvClient.ts |  82 ++++++++---
 src/main.ts                               |  34 ++++-
 src/server.ts                             |  86 ++++++++++--
 src/utils/CircuitBreaker.ts               | 120 ++++++++++++++++
 src/utils/ErrorMonitor.ts                 | 163 ++++++++++++++++++++++
 7 files changed, 462 insertions(+), 36 deletions(-)
 create mode 100644 src/utils/CircuitBreaker.ts
 create mode 100644 src/utils/ErrorMonitor.ts

diff --git a/Dockerfile b/Dockerfile
index a8d3ad61..bba34fd3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,4 +13,4 @@ COPY . *.env ./
 RUN yarn install --no-progress && \
   yarn build-release
 
-CMD node --max-old-space-size=512 --experimental-json-modules build/main.js
+CMD node --max-old-space-size=1024 --max-semi-space-size=128 --optimize-for-size --gc-interval=100 --expose-gc --experimental-json-modules build/main.js
diff --git a/src/db/index.ts b/src/db/index.ts
index e31ae385..59c0c6a2 100644
--- a/src/db/index.ts
+++ b/src/db/index.ts
@@ -49,10 +49,19 @@ export const connectDB = async (onConnected: () => any = defaultFn): Promise {
       logger.error('MongoDB connection error', e)
-      process.exit(1)
+      // Don't exit immediately, let the app try to reconnect
+      // process.exit(1)
     }
   )
 
+  mongoose.connection.on('disconnected', () => {
+    logger.warn('MongoDB disconnected. Attempting to reconnect...')
+  })
+
+  mongoose.connection.on('reconnected', () => {
+    logger.info('MongoDB reconnected successfully')
+  })
+
   await mongoose.connect(
     `${scheme}://${user}:${pass}@${server}/${dbName}?authSource=${authDb}&tls=${tlsFlag}&replicaSet=${rsName}`,
     { autoIndex: true }
diff --git a/src/db/utils/jobs/migration/SirvClient.ts b/src/db/utils/jobs/migration/SirvClient.ts
index e8dcd246..eb3e15b0 100644
--- a/src/db/utils/jobs/migration/SirvClient.ts
+++ b/src/db/utils/jobs/migration/SirvClient.ts
@@ -1,4 +1,5 @@
 import axios from 'axios'
+import { CircuitBreaker, retryWithBackoff } from '../../../../utils/CircuitBreaker'
 
 const SIRV_CONFIG = {
   clientId: process.env.SIRV_CLIENT_ID_RO ?? null,
@@ -9,9 +10,27 @@ const client = axios.create({
   baseURL: 'https://api.sirv.com/v2',
   headers: {
     'content-type': 'application/json'
-  }
+  },
+  timeout: 30000 // 30 second timeout
 })
 
+// Add axios interceptors for better error handling
+client.interceptors.response.use(
+  response => response,
+  async error => {
+    console.error('Sirv API error:', {
+      status: error.response?.status,
+      statusText: error.response?.statusText,
+      data: error.response?.data,
+      config: {
+        method: error.config?.method,
+        url: error.config?.url
+      }
+    })
+    return await Promise.reject(error)
+  }
+)
+
 const headers = {
   'content-type': 'application/json'
 }
@@ -21,6 +40,13 @@ interface TokenParamsType {
   clientId: string | null
   clientSecret: string | null
 }
 
+// Circuit breaker for Sirv API calls
+const sirvCircuitBreaker = new CircuitBreaker({
+  failureThreshold: 3,
+  resetTimeout: 60000, // 1 minute
+  monitoringPeriod: 10000 // 10 seconds
+})
+
 const getToken = async (): Promise<string | null> => {
   const params: TokenParamsType = {
     clientId: SIRV_CONFIG.clientId,
@@ -28,16 +54,19 @@
   }
 
   try {
-    const res = await client.post(
-      '/token',
-      params)
+    const res = await sirvCircuitBreaker.execute(async () => {
+      return await retryWithBackoff(async () => {
+        return await client.post('/token', params)
+      }, 3, 1000, 5000)
+    })
 
     if (res.status === 200) {
       return res.data.token
     }
   } catch (e) {
-    console.error(e)
-    process.exit(1)
+    console.error('Failed to get Sirv token after retries:', e)
+    // Don't exit process - let the app continue without Sirv functionality
+    return null
   }
   return null
 }
@@ -57,22 +86,31 @@ interface FileMetadaata {
  * @returns
  */
 export const getFileInfo = async (filename: string): Promise<FileMetadaata> => {
-  const res = await client.get(
-    '/files/stat?filename=' + encodeURIComponent(filename),
-    {
-      headers: {
-        ...headers,
-        Authorization: `bearer ${token}`
-      }
-    }
-  )
-
-  if (res.status === 200) {
-    const { ctime, mtime } = res.data
-    return ({
-      btime: new Date(ctime),
-      mtime: new Date(mtime)
+  try {
+    const res = await sirvCircuitBreaker.execute(async () => {
+      return await retryWithBackoff(async () => {
+        return await client.get(
+          '/files/stat?filename=' + encodeURIComponent(filename),
+          {
+            headers: {
+              ...headers,
+              Authorization: `bearer ${token}`
+            }
+          }
+        )
+      }, 3, 1000, 5000)
+    })
+
+    if (res.status === 200) {
+      const { ctime, mtime } = res.data
+      return ({
+        btime: new Date(ctime),
+        mtime: new Date(mtime)
+      })
+    }
+    throw new Error('Sirv API.getFileInfo() error: ' + String(res.statusText))
+  } catch (e) {
+    console.error('Failed to get file info after retries:', e)
+    throw e
+  }
 }
-  throw new Error('Sirv API.getFileInfo() error' + res.statusText)
-}
diff --git a/src/main.ts b/src/main.ts
index 58a6c536..3ae30c64 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,14 +1,40 @@
 import { connectDB, defaultPostConnect } from './db/index.js'
 import { createServer } from './server.js'
+import { errorMonitor, setupGlobalErrorHandlers } from './utils/ErrorMonitor.js'
+
+// Setup enhanced error monitoring
+setupGlobalErrorHandlers()
+
+// Enhanced error handling with graceful shutdown
+let isShuttingDown = false
 
 process.on('uncaughtException', (error) => {
   console.error('Uncaught Exception:', error)
-  process.exit(1)
+  errorMonitor.logError(error, 'UNCAUGHT_EXCEPTION')
+
+  if (!isShuttingDown) {
+    isShuttingDown = true
+    // Give some time for cleanup before exiting
+    setTimeout(() => {
+      console.log('Final error stats:', errorMonitor.getStats())
+      process.exit(1)
+    }, 5000)
+  }
 })
 
-process.on('unhandledRejection', (reason) => {
-  console.error('Unhandled Rejection:', reason)
-  process.exit(1)
+process.on('unhandledRejection', (reason, promise) => {
+  console.error('Unhandled Rejection at:', promise, 'reason:', reason)
+  const error = reason instanceof Error ? reason : new Error(String(reason))
+  errorMonitor.logError(error, 'UNHANDLED_REJECTION', { promise })
+
+  // Don't exit immediately on unhandled rejections in production
+  // Log the error and continue running
+  if (process.env.NODE_ENV !== 'production') {
+    if (!isShuttingDown) {
+      isShuttingDown = true
+      setTimeout(() => process.exit(1), 5000)
+    }
+  }
 })
 
 process.on('SIGTERM', () => {
diff --git a/src/server.ts b/src/server.ts
index d6543edf..8cb63510 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -21,6 +21,7 @@ import localDevBypassAuthPermissions from './auth/local-dev/permissions.js'
 import MutableOrgDS from './model/MutableOrganizationDataSource.js'
 import UserDataSource from './model/UserDataSource.js'
 import BulkImportDataSource from './model/BulkImportDataSource.js'
+import { errorMonitor } from './utils/ErrorMonitor.js'
 /**
  * Create a GraphQL server
  */
@@ -49,26 +50,95 @@ export async function createServer (): Promise<{ app: express.Application, serve
     schema,
     plugins: [ApolloServerPluginDrainHttpServer({ httpServer })],
     cache: new InMemoryLRUCache({
-      max: 50,
-      maxSize: 1024 * 1024 * 10
-    })
+      max: 100,
+      maxSize: 1024 * 1024 * 20, // Increased cache size
+      ttl: 300000 // 5 minutes TTL to prevent memory leaks
+    }),
+    // Enhanced error handling
+    formatError: (formattedError, _error) => {
+      // Log the error with enhanced monitoring
+      errorMonitor.logGraphQLError(formattedError, undefined, undefined)
+
+      // Don't expose internal errors in production
+      if (process.env.NODE_ENV === 'production') {
+        // Remove stack trace and internal details
+        if (formattedError.extensions?.exception != null) {
+          delete (formattedError.extensions.exception as any).stacktrace
+        }
+        if (formattedError.message.includes('internal') ||
+          formattedError.message.includes('database')) {
+          return new Error('Internal server error')
+        }
+      }
+
+      return formattedError
+    }
   })
   // server must be started before applying middleware
   await server.start()
   const context = process.env.LOCAL_DEV_BYPASS_AUTH === 'true' ? localDevBypassAuthContext : createContext
 
+  // Enhanced health check with memory monitoring
   app.get('/health', (req, res) => {
     const memUsage = process.memoryUsage()
+    const uptime = process.uptime()
+
+    // Check if memory usage is getting too high
+    const heapUsedMB = Math.round(memUsage.heapUsed / 1024 / 1024)
+    const heapTotalMB = Math.round(memUsage.heapTotal / 1024 / 1024)
+    const memoryUsagePercent = (heapUsedMB / heapTotalMB) * 100
+
+    const status = memoryUsagePercent > 85 ? 'warning' : 'ok'
+
     res.json({
-      status: 'ok',
+      status,
       timestamp: new Date().toISOString(),
+      uptime: `${Math.round(uptime / 60)} minutes`,
       memory: {
         rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`,
-        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
-        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
-        external: `${Math.round(memUsage.external / 1024 / 1024)}MB`
-      }
+        heapTotal: `${heapTotalMB}MB`,
+        heapUsed: `${heapUsedMB}MB`,
+        external: `${Math.round(memUsage.external / 1024 / 1024)}MB`,
+        usagePercent: `${Math.round(memoryUsagePercent)}%`
+      },
+      warnings: memoryUsagePercent > 85 ? ['High memory usage detected'] : []
+    })
+
+    // Log warning if memory usage is high
+    if (memoryUsagePercent > 85) {
+      console.warn(`High memory usage: ${Math.round(memoryUsagePercent)}% (${heapUsedMB}MB/${heapTotalMB}MB)`)
+    }
+  })
+
+  // Periodic memory cleanup and monitoring
+  setInterval(() => {
+    const memUsage = process.memoryUsage()
+    const heapUsedMB = Math.round(memUsage.heapUsed / 1024 / 1024)
+    const heapTotalMB = Math.round(memUsage.heapTotal / 1024 / 1024)
+    const memoryUsagePercent = (heapUsedMB / heapTotalMB) * 100
+
+    // Force garbage collection if memory usage is high and gc is available
+    if (memoryUsagePercent > 80 && global.gc != null) {
+      console.log('Running garbage collection due to high memory usage')
+      global.gc()
+    }
+
+    // Log memory stats every 5 minutes
+    console.log(`Memory usage: ${heapUsedMB}MB/${heapTotalMB}MB (${Math.round(memoryUsagePercent)}%)`)
+  }, 5 * 60 * 1000) // Every 5 minutes
+
+  // Error monitoring endpoint
+  app.get('/errors', (req, res) => {
+    const stats = errorMonitor.getStats()
+    const healthStatus = errorMonitor.getHealthStatus()
+
+    res.json({
+      healthStatus,
+      totalErrors: stats.totalErrors,
+      errorsByType: Object.fromEntries(stats.errorsByType),
+      recentErrors: stats.recentErrors.slice(-10), // Last 10 errors
+      lastReset: stats.lastReset
     })
   })
diff --git a/src/utils/CircuitBreaker.ts b/src/utils/CircuitBreaker.ts
new file mode 100644
index 00000000..aa838c7b
--- /dev/null
+++ b/src/utils/CircuitBreaker.ts
@@ -0,0 +1,120 @@
+/**
+ * Circuit breaker pattern implementation for handling network failures
+ */
+
+export enum CircuitState {
+  CLOSED = 'CLOSED',
+  OPEN = 'OPEN',
+  HALF_OPEN = 'HALF_OPEN'
+}
+
+export interface CircuitBreakerOptions {
+  failureThreshold: number
+  resetTimeout: number
+  monitoringPeriod: number
+}
+
+export class CircuitBreaker {
+  private state: CircuitState = CircuitState.CLOSED
+  private failureCount: number = 0
+  private lastFailureTime?: number
+  private successCount: number = 0
+
+  constructor (
+    private readonly options: CircuitBreakerOptions = {
+      failureThreshold: 5,
+      resetTimeout: 60000, // 1 minute
+      monitoringPeriod: 10000 // 10 seconds
+    }
+  ) {}
+
+  async execute<T> (operation: () => Promise<T>): Promise<T> {
+    if (this.state === CircuitState.OPEN) {
+      if (this.shouldAttemptReset()) {
+        this.state = CircuitState.HALF_OPEN
+      } else {
+        throw new Error('Circuit breaker is OPEN - operation not allowed')
+      }
+    }
+
+    try {
+      const result = await operation()
+      this.onSuccess()
+      return result
+    } catch (error) {
+      this.onFailure()
+      throw error
+    }
+  }
+
+  private onSuccess (): void {
+    this.failureCount = 0
+    if (this.state === CircuitState.HALF_OPEN) {
+      this.state = CircuitState.CLOSED
+    }
+    this.successCount++
+  }
+
+  private onFailure (): void {
+    this.failureCount++
+    this.lastFailureTime = Date.now()
+
+    if (this.failureCount >= this.options.failureThreshold) {
+      this.state = CircuitState.OPEN
+    }
+  }
+
+  private shouldAttemptReset (): boolean {
+    return (
+      this.lastFailureTime != null &&
+      Date.now() - this.lastFailureTime >= this.options.resetTimeout
+    )
+  }
+
+  getState (): CircuitState {
+    return this.state
+  }
+
+  getStats (): { state: CircuitState, failureCount: number, successCount: number, lastFailureTime?: number } {
+    return {
+      state: this.state,
+      failureCount: this.failureCount,
+      successCount: this.successCount,
+      lastFailureTime: this.lastFailureTime
+    }
+  }
+}
+
+/**
+ * Retry with exponential backoff
+ */
+export async function retryWithBackoff<T> (
+  operation: () => Promise<T>,
+  maxRetries: number = 3,
+  initialDelay: number = 1000,
+  maxDelay: number = 10000
+): Promise<T> {
+  let lastError: Error | undefined
+
+  for (let attempt = 1; attempt <= maxRetries; attempt++) {
+    try {
+      return await operation()
+    } catch (error) {
+      lastError = error as Error
+
+      if (attempt === maxRetries) {
+        break
+      }
+
+      const delay = Math.min(
+        initialDelay * Math.pow(2, attempt - 1),
+        maxDelay
+      )
+
+      console.warn(`Operation failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms:`, error)
+      await new Promise(resolve => setTimeout(resolve, delay))
+    }
+  }
+
+  throw lastError ?? new Error('Operation failed after all retry attempts')
+}
diff --git a/src/utils/ErrorMonitor.ts b/src/utils/ErrorMonitor.ts
new file mode 100644
index 00000000..9081e050
--- /dev/null
+++ b/src/utils/ErrorMonitor.ts
@@ -0,0 +1,163 @@
+/**
+ * Enhanced error monitoring and alerting system
+ */
+
+export interface ErrorStats {
+  totalErrors: number
+  errorsByType: Map<string, number>
+  recentErrors: Array<{
+    timestamp: Date
+    error: string
+    type: string
+    stack?: string
+  }>
+  lastReset: Date
+}
+
+export class ErrorMonitor {
+  private stats: ErrorStats = {
+    totalErrors: 0,
+    errorsByType: new Map(),
+    recentErrors: [],
+    lastReset: new Date()
+  }
+
+  private readonly maxRecentErrors = 100
+  private readonly alertThresholds = {
+    errorsPerMinute: 10,
+    totalErrors: 50
+  }
+
+  logError (error: Error, type: string = 'unknown', context?: any): void {
+    this.stats.totalErrors++
+
+    // Track errors by type
+    const currentCount = this.stats.errorsByType.get(type) ?? 0
+    this.stats.errorsByType.set(type, currentCount + 1)
+
+    // Add to recent errors
+    this.stats.recentErrors.push({
+      timestamp: new Date(),
+      error: error.message,
+      type,
+      stack: error.stack
+    })
+
+    // Keep only recent errors
+    if (this.stats.recentErrors.length > this.maxRecentErrors) {
+      this.stats.recentErrors = this.stats.recentErrors.slice(-this.maxRecentErrors)
+    }
+
+    // Log the error with context
+    console.error(`[${type}] Error:`, {
+      message: error.message,
+      stack: error.stack,
+      context,
+      timestamp: new Date().toISOString()
+    })
+
+    // Check if we need to alert
+    this.checkAlertThresholds()
+  }
+
+  logGraphQLError (error: any, query?: string, variables?: any): void {
+    const errorType = error.extensions?.code ?? 'GRAPHQL_ERROR'
+    this.logError(error, errorType, { query, variables })
+  }
+
+  logNetworkError (error: Error, url?: string, method?: string): void {
+    this.logError(error, 'NETWORK_ERROR', { url, method })
+  }
+
+  logDatabaseError (error: Error, operation?: string): void {
+    this.logError(error, 'DATABASE_ERROR', { operation })
+  }
+
+  private checkAlertThresholds (): void {
+    const now = new Date()
+    const oneMinuteAgo = new Date(now.getTime() - 60000)
+
+    // Count errors in the last minute
+    const recentErrorCount = this.stats.recentErrors.filter(
+      err => err.timestamp > oneMinuteAgo
+    ).length
+
+    if (recentErrorCount >= this.alertThresholds.errorsPerMinute) {
+      console.error(`🚨 HIGH ERROR RATE ALERT: ${recentErrorCount} errors in the last minute`)
+      this.logSystemStatus()
+    }
+
+    if (this.stats.totalErrors >= this.alertThresholds.totalErrors) {
+      console.error(`🚨 HIGH TOTAL ERROR COUNT: ${this.stats.totalErrors} total errors since ${this.stats.lastReset}`)
+    }
+  }
+
+  private logSystemStatus (): void {
+    const memUsage = process.memoryUsage()
+    const uptime = process.uptime()
+
+    console.log('System Status:', {
+      uptime: `${Math.round(uptime / 60)} minutes`,
+      memory: {
+        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
+        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
+        rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`
+      },
+      errors: {
+        total: this.stats.totalErrors,
+        byType: Object.fromEntries(this.stats.errorsByType)
+      }
+    })
+  }
+
+  getStats (): ErrorStats {
+    return {
+      ...this.stats,
+      errorsByType: new Map(this.stats.errorsByType),
+      recentErrors: [...this.stats.recentErrors]
+    }
+  }
+
+  reset (): void {
+    this.stats = {
+      totalErrors: 0,
+      errorsByType: new Map(),
+      recentErrors: [],
+      lastReset: new Date()
+    }
+  }
+
+  // Get health status based on error rates
+  getHealthStatus (): 'healthy' | 'warning' | 'critical' {
+    const now = new Date()
+    const oneMinuteAgo = new Date(now.getTime() - 60000)
+    const fiveMinutesAgo = new Date(now.getTime() - 300000)
+
+    const errorsLastMinute = this.stats.recentErrors.filter(
+      err => err.timestamp > oneMinuteAgo
+    ).length
+
+    const errorsLastFiveMinutes = this.stats.recentErrors.filter(
+      err => err.timestamp > fiveMinutesAgo
+    ).length
+
+    if (errorsLastMinute >= 10) return 'critical'
+    if (errorsLastFiveMinutes >= 20) return 'warning'
+    return 'healthy'
+  }
+}
+
+// Global error monitor instance
+export const errorMonitor = new ErrorMonitor()
+
+// Setup global error handlers
+export function setupGlobalErrorHandlers (): void {
+  process.on('uncaughtException', (error) => {
+    errorMonitor.logError(error, 'UNCAUGHT_EXCEPTION')
+  })
+
+  process.on('unhandledRejection', (reason, promise) => {
+    const error = reason instanceof Error ? reason : new Error(String(reason))
+    errorMonitor.logError(error, 'UNHANDLED_REJECTION', { promise })
+  })
+}

From ec7a1072f2ea2cb407915471b3a219392a0cdc1f Mon Sep 17 00:00:00 2001
From: Mike Schennum
Date: Wed, 17 Sep 2025 10:59:29 -0700
Subject: [PATCH 3/5] fix ts error

---
 src/utils/ErrorMonitor.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/ErrorMonitor.ts b/src/utils/ErrorMonitor.ts
index 9081e050..c26b9d4a 100644
--- a/src/utils/ErrorMonitor.ts
+++ b/src/utils/ErrorMonitor.ts
@@ -88,7 +88,7 @@ export class ErrorMonitor {
     }
 
     if (this.stats.totalErrors >= this.alertThresholds.totalErrors) {
-      console.error(`🚨 HIGH TOTAL ERROR COUNT: ${this.stats.totalErrors} total errors since ${this.stats.lastReset}`)
+      console.error(`🚨 HIGH TOTAL ERROR COUNT: ${this.stats.totalErrors} total errors since ${this.stats.lastReset.toISOString()}`)
     }
   }
 

From 6114964f2d8eeb6c4afeea953f730b65f0bc700c Mon Sep 17 00:00:00 2001
From: Mike Schennum
Date: Wed, 17 Sep 2025 12:22:00 -0700
Subject: [PATCH 4/5] remove error monitor

---
 src/server.ts             |  86 ++------------------
 src/utils/ErrorMonitor.ts | 163 --------------------------------
 2 files changed, 8 insertions(+), 241 deletions(-)
 delete mode 100644 src/utils/ErrorMonitor.ts

diff --git a/src/server.ts b/src/server.ts
index 8cb63510..d6543edf 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -21,7 +21,6 @@ import localDevBypassAuthPermissions from './auth/local-dev/permissions.js'
 import MutableOrgDS from './model/MutableOrganizationDataSource.js'
 import UserDataSource from './model/UserDataSource.js'
 import BulkImportDataSource from './model/BulkImportDataSource.js'
-import { errorMonitor } from './utils/ErrorMonitor.js'
 /**
  * Create a GraphQL server
  */
@@ -50,95 +49,26 @@ export async function createServer (): Promise<{ app: express.Application, serve
     schema,
     plugins: [ApolloServerPluginDrainHttpServer({ httpServer })],
     cache: new InMemoryLRUCache({
-      max: 100,
-      maxSize: 1024 * 1024 * 20, // Increased cache size
-      ttl: 300000 // 5 minutes TTL to prevent memory leaks
-    }),
-    // Enhanced error handling
-    formatError: (formattedError, _error) => {
-      // Log the error with enhanced monitoring
-      errorMonitor.logGraphQLError(formattedError, undefined, undefined)
-
-      // Don't expose internal errors in production
-      if (process.env.NODE_ENV === 'production') {
-        // Remove stack trace and internal details
-        if (formattedError.extensions?.exception != null) {
-          delete (formattedError.extensions.exception as any).stacktrace
-        }
-        if (formattedError.message.includes('internal') ||
-          formattedError.message.includes('database')) {
-          return new Error('Internal server error')
-        }
-      }
-
-      return formattedError
-    }
+      max: 50,
+      maxSize: 1024 * 1024 * 10
+    })
   })
   // server must be started before applying middleware
   await server.start()
   const context = process.env.LOCAL_DEV_BYPASS_AUTH === 'true' ? localDevBypassAuthContext : createContext
 
-  // Enhanced health check with memory monitoring
   app.get('/health', (req, res) => {
     const memUsage = process.memoryUsage()
-    const uptime = process.uptime()
-
-    // Check if memory usage is getting too high
-    const heapUsedMB = Math.round(memUsage.heapUsed / 1024 / 1024)
-    const heapTotalMB = Math.round(memUsage.heapTotal / 1024 / 1024)
-    const memoryUsagePercent = (heapUsedMB / heapTotalMB) * 100
-
-    const status = memoryUsagePercent > 85 ? 'warning' : 'ok'
-
     res.json({
-      status,
+      status: 'ok',
       timestamp: new Date().toISOString(),
-      uptime: `${Math.round(uptime / 60)} minutes`,
       memory: {
         rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`,
-        heapTotal: `${heapTotalMB}MB`,
-        heapUsed: `${heapUsedMB}MB`,
-        external: `${Math.round(memUsage.external / 1024 / 1024)}MB`,
-        usagePercent: `${Math.round(memoryUsagePercent)}%`
-      },
-      warnings: memoryUsagePercent > 85 ? ['High memory usage detected'] : []
-    })
-
-    // Log warning if memory usage is high
-    if (memoryUsagePercent > 85) {
-      console.warn(`High memory usage: ${Math.round(memoryUsagePercent)}% (${heapUsedMB}MB/${heapTotalMB}MB)`)
-    }
-  })
-
-  // Periodic memory cleanup and monitoring
-  setInterval(() => {
-    const memUsage = process.memoryUsage()
-    const heapUsedMB = Math.round(memUsage.heapUsed / 1024 / 1024)
-    const heapTotalMB = Math.round(memUsage.heapTotal / 1024 / 1024)
-    const memoryUsagePercent = (heapUsedMB / heapTotalMB) * 100
-
-    // Force garbage collection if memory usage is high and gc is available
-    if (memoryUsagePercent > 80 && global.gc != null) {
-      console.log('Running garbage collection due to high memory usage')
-      global.gc()
-    }
-
-    // Log memory stats every 5 minutes
-    console.log(`Memory usage: ${heapUsedMB}MB/${heapTotalMB}MB (${Math.round(memoryUsagePercent)}%)`)
-  }, 5 * 60 * 1000) // Every 5 minutes
-
-  // Error monitoring endpoint
-  app.get('/errors', (req, res) => {
-    const stats = errorMonitor.getStats()
-    const healthStatus = errorMonitor.getHealthStatus()
-
-    res.json({
-      healthStatus,
-      totalErrors: stats.totalErrors,
-      errorsByType: Object.fromEntries(stats.errorsByType),
-      recentErrors: stats.recentErrors.slice(-10), // Last 10 errors
-      lastReset: stats.lastReset
+        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
+        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
+        external: `${Math.round(memUsage.external / 1024 / 1024)}MB`
+      }
     })
   })
diff --git a/src/utils/ErrorMonitor.ts b/src/utils/ErrorMonitor.ts
deleted file mode 100644
index c26b9d4a..00000000
--- a/src/utils/ErrorMonitor.ts
+++ /dev/null
@@ -1,163 +0,0 @@
-/**
- * Enhanced error monitoring and alerting system
- */
-
-export interface ErrorStats {
-  totalErrors: number
-  errorsByType: Map<string, number>
-  recentErrors: Array<{
-    timestamp: Date
-    error: string
-    type: string
-    stack?: string
-  }>
-  lastReset: Date
-}
-
-export class ErrorMonitor {
-  private stats: ErrorStats = {
-    totalErrors: 0,
-    errorsByType: new Map(),
-    recentErrors: [],
-    lastReset: new Date()
-  }
-
-  private readonly maxRecentErrors = 100
-  private readonly alertThresholds = {
-    errorsPerMinute: 10,
-    totalErrors: 50
-  }
-
-  logError (error: Error, type: string = 'unknown', context?: any): void {
-    this.stats.totalErrors++
-
-    // Track errors by type
-    const currentCount = this.stats.errorsByType.get(type) ?? 0
-    this.stats.errorsByType.set(type, currentCount + 1)
-
-    // Add to recent errors
-    this.stats.recentErrors.push({
-      timestamp: new Date(),
-      error: error.message,
-      type,
-      stack: error.stack
-    })
-
-    // Keep only recent errors
-    if (this.stats.recentErrors.length > this.maxRecentErrors) {
-      this.stats.recentErrors = this.stats.recentErrors.slice(-this.maxRecentErrors)
-    }
-
-    // Log the error with context
-    console.error(`[${type}] Error:`, {
-      message: error.message,
-      stack: error.stack,
-      context,
-      timestamp: new Date().toISOString()
-    })
-
-    // Check if we need to alert
-    this.checkAlertThresholds()
-  }
-
-  logGraphQLError (error: any, query?: string, variables?: any): void {
-    const errorType = error.extensions?.code ?? 'GRAPHQL_ERROR'
-    this.logError(error, errorType, { query, variables })
-  }
-
-  logNetworkError (error: Error, url?: string, method?: string): void {
-    this.logError(error, 'NETWORK_ERROR', { url, method })
-  }
-
-  logDatabaseError (error: Error, operation?: string): void {
-    this.logError(error, 'DATABASE_ERROR', { operation })
-  }
-
-  private checkAlertThresholds (): void {
-    const now = new Date()
-    const oneMinuteAgo = new Date(now.getTime() - 60000)
-
-    // Count errors in the last minute
-    const recentErrorCount = this.stats.recentErrors.filter(
-      err => err.timestamp > oneMinuteAgo
-    ).length
-
-    if (recentErrorCount >= this.alertThresholds.errorsPerMinute) {
-      console.error(`🚨 HIGH ERROR RATE ALERT: ${recentErrorCount} errors in the last minute`)
-      this.logSystemStatus()
-    }
-
-    if (this.stats.totalErrors >= this.alertThresholds.totalErrors) {
-      console.error(`🚨 HIGH TOTAL ERROR COUNT: ${this.stats.totalErrors} total errors since ${this.stats.lastReset.toISOString()}`)
-    }
-  }
-
-  private logSystemStatus (): void {
-    const memUsage = process.memoryUsage()
-    const uptime = process.uptime()
-
-    console.log('System Status:', {
-      uptime: `${Math.round(uptime / 60)} minutes`,
-      memory: {
-        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
-        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
-        rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`
-      },
-      errors: {
-        total: this.stats.totalErrors,
-        byType: Object.fromEntries(this.stats.errorsByType)
-      }
-    })
-  }
-
-  getStats (): ErrorStats {
-    return {
-      ...this.stats,
-      errorsByType: new Map(this.stats.errorsByType),
-      recentErrors: [...this.stats.recentErrors]
-    }
-  }
-
-  reset (): void {
-    this.stats = {
-      totalErrors: 0,
-      errorsByType: new Map(),
-      recentErrors: [],
-      lastReset: new Date()
-    }
-  }
-
-  // Get health status based on error rates
-  getHealthStatus (): 'healthy' | 'warning' | 'critical' {
-    const now = new Date()
-    const oneMinuteAgo = new Date(now.getTime() - 60000)
-    const fiveMinutesAgo = new Date(now.getTime() - 300000)
-
-    const errorsLastMinute = this.stats.recentErrors.filter(
-      err => err.timestamp > oneMinuteAgo
-    ).length
-
-    const errorsLastFiveMinutes = this.stats.recentErrors.filter(
-      err => err.timestamp > fiveMinutesAgo
-    ).length
-
-    if (errorsLastMinute >= 10) return 'critical'
-    if (errorsLastFiveMinutes >= 20) return 'warning'
-    return 'healthy'
-  }
-}
-
-// Global error monitor instance
-export const errorMonitor = new ErrorMonitor()
-
-// Setup global error handlers
-export function setupGlobalErrorHandlers (): void {
-  process.on('uncaughtException', (error) => {
-    errorMonitor.logError(error, 'UNCAUGHT_EXCEPTION')
-  })
-
-  process.on('unhandledRejection', (reason, promise) => {
-    const error = reason instanceof Error ? reason : new Error(String(reason))
-    errorMonitor.logError(error, 'UNHANDLED_REJECTION', { promise })
-  })
-}

From 06ef88359cdd6efb4a74cdc75bb730b21ac206c5 Mon Sep 17 00:00:00 2001
From: Mike Schennum
Date: Wed, 17 Sep 2025 12:28:37 -0700
Subject: [PATCH 5/5] readd error monitor file

---
 src/utils/ErrorMonitor.ts | 163 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 src/utils/ErrorMonitor.ts

diff --git a/src/utils/ErrorMonitor.ts b/src/utils/ErrorMonitor.ts
new file mode 100644
index 00000000..c26b9d4a
--- /dev/null
+++ b/src/utils/ErrorMonitor.ts
@@ -0,0 +1,163 @@
+/**
+ * Enhanced error monitoring and alerting system
+ */
+
+export interface ErrorStats {
+  totalErrors: number
+  errorsByType: Map<string, number>
+  recentErrors: Array<{
+    timestamp: Date
+    error: string
+    type: string
+    stack?: string
+  }>
+  lastReset: Date
+}
+
+export class ErrorMonitor {
+  private stats: ErrorStats = {
+    totalErrors: 0,
+    errorsByType: new Map(),
+    recentErrors: [],
+    lastReset: new Date()
+  }
+
+  private readonly maxRecentErrors = 100
+  private readonly alertThresholds = {
+    errorsPerMinute: 10,
+    totalErrors: 50
+  }
+
+  logError (error: Error, type: string = 'unknown', context?: any): void {
+    this.stats.totalErrors++
+
+    // Track errors by type
+    const currentCount = this.stats.errorsByType.get(type) ?? 0
+    this.stats.errorsByType.set(type, currentCount + 1)
+
+    // Add to recent errors
+    this.stats.recentErrors.push({
+      timestamp: new Date(),
+      error: error.message,
+      type,
+      stack: error.stack
+    })
+
+    // Keep only recent errors
+    if (this.stats.recentErrors.length > this.maxRecentErrors) {
+      this.stats.recentErrors = this.stats.recentErrors.slice(-this.maxRecentErrors)
+    }
+
+    // Log the error with context
+    console.error(`[${type}] Error:`, {
+      message: error.message,
+      stack: error.stack,
+      context,
+      timestamp: new Date().toISOString()
+    })
+
+    // Check if we need to alert
+    this.checkAlertThresholds()
+  }
+
+  logGraphQLError (error: any, query?: string, variables?: any): void {
+    const errorType = error.extensions?.code ?? 'GRAPHQL_ERROR'
+    this.logError(error, errorType, { query, variables })
+  }
+
+  logNetworkError (error: Error, url?: string, method?: string): void {
+    this.logError(error, 'NETWORK_ERROR', { url, method })
+  }
+
+  logDatabaseError (error: Error, operation?: string): void {
+    this.logError(error, 'DATABASE_ERROR', { operation })
+  }
+
+  private checkAlertThresholds (): void {
+    const now = new Date()
+    const oneMinuteAgo = new Date(now.getTime() - 60000)
+
+    // Count errors in the last minute
+    const recentErrorCount = this.stats.recentErrors.filter(
+      err => err.timestamp > oneMinuteAgo
+    ).length
+
+    if (recentErrorCount >= this.alertThresholds.errorsPerMinute) {
+      console.error(`🚨 HIGH ERROR RATE ALERT: ${recentErrorCount} errors in the last minute`)
+      this.logSystemStatus()
+    }
+
+    if (this.stats.totalErrors >= this.alertThresholds.totalErrors) {
+      console.error(`🚨 HIGH TOTAL ERROR COUNT: ${this.stats.totalErrors} total errors since ${this.stats.lastReset.toISOString()}`)
+    }
+  }
+
+  private logSystemStatus (): void {
+    const memUsage = process.memoryUsage()
+    const uptime = process.uptime()
+
+    console.log('System Status:', {
+      uptime: `${Math.round(uptime / 60)} minutes`,
+      memory: {
+        heapUsed: `${Math.round(memUsage.heapUsed / 1024 / 1024)}MB`,
+        heapTotal: `${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`,
+        rss: `${Math.round(memUsage.rss / 1024 / 1024)}MB`
+      },
+      errors: {
+        total: this.stats.totalErrors,
+        byType: Object.fromEntries(this.stats.errorsByType)
+      }
+    })
+  }
+
+  getStats (): ErrorStats {
+    return {
+      ...this.stats,
+      errorsByType: new Map(this.stats.errorsByType),
+      recentErrors: [...this.stats.recentErrors]
+    }
+  }
+
+  reset (): void {
+    this.stats = {
+      totalErrors: 0,
+      errorsByType: new Map(),
+      recentErrors: [],
+      lastReset: new Date()
+    }
+  }
+
+  // Get health status based on error rates
+  getHealthStatus (): 'healthy' | 'warning' | 'critical' {
+    const now = new Date()
+    const oneMinuteAgo = new Date(now.getTime() - 60000)
+    const fiveMinutesAgo = new Date(now.getTime() - 300000)
+
+    const errorsLastMinute = this.stats.recentErrors.filter(
+      err => err.timestamp > oneMinuteAgo
+    ).length
+
+    const errorsLastFiveMinutes = this.stats.recentErrors.filter(
+      err => err.timestamp > fiveMinutesAgo
+    ).length
+
+    if (errorsLastMinute >= 10) return 'critical'
+    if (errorsLastFiveMinutes >= 20) return 'warning'
+    return 'healthy'
+  }
+}
+
+// Global error monitor instance
+export const errorMonitor = new ErrorMonitor()
+
+// Setup global error handlers
+export function setupGlobalErrorHandlers (): void {
+  process.on('uncaughtException', (error) => {
+    errorMonitor.logError(error, 'UNCAUGHT_EXCEPTION')
+  })
+
+  process.on('unhandledRejection', (reason, promise) => {
+    const error = reason instanceof Error ? reason : new Error(String(reason))
+    errorMonitor.logError(error, 'UNHANDLED_REJECTION', { promise })
+  })
+}
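
Editorial usage note (not part of the patch series): the sketch below shows how the CircuitBreaker and retryWithBackoff helpers introduced in PATCH 2 are meant to compose around an outbound call, mirroring their use in SirvClient.ts. The file location, import path, fetchStatus name, and endpoint are hypothetical; only the exports from src/utils/CircuitBreaker.ts shown above are assumed.

import axios from 'axios'
// Hypothetical relative path; adjust to the importing file's location.
import { CircuitBreaker, retryWithBackoff } from './utils/CircuitBreaker'

// Open the breaker after 3 consecutive failures; allow a probe again after 60s.
const breaker = new CircuitBreaker({
  failureThreshold: 3,
  resetTimeout: 60000, // 1 minute
  monitoringPeriod: 10000 // 10 seconds
})

// Each execute() attempt is retried up to 3 times with exponential backoff
// (1s initial delay, 5s cap) before the breaker records a single failure,
// matching the getToken/getFileInfo pattern in PATCH 2.
export async function fetchStatus (): Promise<unknown> {
  return await breaker.execute(async () => {
    return await retryWithBackoff(async () => {
      const res = await axios.get('https://example.com/status') // hypothetical endpoint
      return res.data
    }, 3, 1000, 5000)
  })
}

While the breaker is OPEN, execute() rejects immediately with 'Circuit breaker is OPEN - operation not allowed', so callers should treat that rejection as a degraded-but-alive state rather than letting it crash the process.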