From 129b6c33711f734999d7cc167ed1391b12e19777 Mon Sep 17 00:00:00 2001 From: Dmitry Alexeenko Date: Tue, 17 Feb 2026 15:38:13 +0000 Subject: [PATCH] Improve gateway restart reliability, add diagnostic endpoint, clean stale AI Gateway providers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway restart handler was unreliable — Process.kill() didn't always terminate the openclaw gateway, leaving stale processes and lock files. This overhauls the restart flow and adds a diagnostic endpoint for debugging API connectivity and AI Gateway configuration issues. Gateway restart improvements: - Force kill via pkill -9, then also kill via the Process API if a process is still found - Remove lock files (/tmp/openclaw-gateway.lock, /root/.openclaw/gateway.lock) - Wait for process to fully die before restarting - Clean stale AI Gateway providers (cf-ai-gw-*, cloudflare-ai-gateway) from openclaw config on restart to prevent config validation failures Diagnostic endpoint (GET /api/admin/diagnostic): - Shows Worker env var status for all AI-related settings (API keys truncated to their first 10 characters, gateway token reported only as set/unset) - Constructs and displays AI Gateway URL (mirrors start-openclaw.sh logic) - Reads openclaw config from container showing providers and default model - Checks gateway process status via ps - Tests direct Anthropic API connectivity from inside container - Tests AI Gateway URL connectivity when configured (request is only sent for the anthropic provider; other providers are reported without a test call) Stale provider cleanup in start-openclaw.sh: - When CF_AI_GATEWAY_MODEL is not set, remove any cf-ai-gw-* providers restored from R2 backup and reset default model if it referenced one - Prevents config validation failures from stale R2 backups Co-Authored-By: Claude Opus 4.6 --- src/gateway/index.ts | 1 + src/routes/api.ts | 208 ++++++++++++++++++++++++++++++++++++++++--- start-openclaw.sh | 18 ++++ 3 files changed, 217 insertions(+), 10 deletions(-) diff --git a/src/gateway/index.ts b/src/gateway/index.ts index b54f1a0d8..1eba55c04 100644 --- a/src/gateway/index.ts +++ b/src/gateway/index.ts @@ -1,4 +1,5 @@ export 
{ ensureMoltbotGateway, findExistingMoltbotProcess } from './process'; +export { buildEnvVars } from './env'; export { waitForProcess } from './utils'; export { ensureRcloneConfig } from './r2'; export { syncToR2 } from './sync'; diff --git a/src/routes/api.ts b/src/routes/api.ts index 65cf62f8d..b26741149 100644 --- a/src/routes/api.ts +++ b/src/routes/api.ts @@ -2,6 +2,7 @@ import { Hono } from 'hono'; import type { AppEnv } from '../types'; import { createAccessMiddleware } from '../auth'; import { + buildEnvVars, ensureMoltbotGateway, findExistingMoltbotProcess, syncToR2, @@ -262,18 +263,84 @@ adminApi.post('/gateway/restart', async (c) => { const sandbox = c.get('sandbox'); try { - // Find and kill the existing gateway process - const existingProcess = await findExistingMoltbotProcess(sandbox); + // Force kill gateway via exec (more reliable than Process.kill()) + try { + await sandbox.exec('pkill -9 -f "openclaw gateway" 2>/dev/null || true'); + } catch { + // Ignore - process may not exist + } + // Also try the Process API + const existingProcess = await findExistingMoltbotProcess(sandbox); if (existingProcess) { - console.log('Killing existing gateway process:', existingProcess.id); + console.log('Also killing via Process API:', existingProcess.id); try { await existingProcess.kill(); - } catch (killErr) { - console.error('Error killing process:', killErr); + } catch { + // Ignore } - // Wait a moment for the process to die - await new Promise((r) => setTimeout(r, 2000)); + } + + // Clean up lock files + try { + await sandbox.exec( + 'rm -f /tmp/openclaw-gateway.lock /root/.openclaw/gateway.lock 2>/dev/null || true', + ); + } catch { + // Ignore + } + + // Wait for process to fully die + await new Promise((r) => setTimeout(r, 3000)); + + // Verify it's dead + try { + const check = await sandbox.exec('pgrep -f "openclaw gateway" || echo "dead"'); + console.log('[Restart] Process check after kill:', check.stdout?.trim()); + } catch { + // Ignore + } + + // 
Clean up stale providers and ensure API key is in config + try { + const anthropicKey = c.env.ANTHROPIC_API_KEY || ''; + const fixScript = `node -e " + const fs = require('fs'); + const p = '/root/.openclaw/openclaw.json'; + if (fs.existsSync(p)) { + const c = JSON.parse(fs.readFileSync(p, 'utf8')); + let changed = false; + c.models = c.models || {}; + c.models.providers = c.models.providers || {}; + // Remove stale AI Gateway providers + for (const k of Object.keys(c.models.providers)) { + if (k.startsWith('cf-ai-gw-') || k === 'cloudflare-ai-gateway') { + delete c.models.providers[k]; + changed = true; + console.log('Removed provider: ' + k); + } + } + // Reset default model if it references a removed provider + if (c.agents && c.agents.defaults && c.agents.defaults.model) { + const pr = (c.agents.defaults.model.primary || ''); + if (pr.startsWith('cf-ai-gw-') || pr.startsWith('cloudflare-ai-gateway')) { + delete c.agents.defaults.model; + changed = true; + console.log('Reset default model: ' + pr); + } + } + if (changed) { + fs.writeFileSync(p, JSON.stringify(c, null, 2)); + console.log('Config fixed'); + } else { + console.log('Config OK'); + } + } + "`; + const result = await sandbox.exec(fixScript); + console.log('[Config cleanup] stdout:', result.stdout, 'stderr:', result.stderr); + } catch (fixErr) { + console.error('[Config cleanup] Failed:', fixErr); } // Start a new gateway in the background @@ -284,9 +351,7 @@ adminApi.post('/gateway/restart', async (c) => { return c.json({ success: true, - message: existingProcess - ? 'Gateway process killed, new instance starting...' 
- : 'No existing process found, starting new instance...', + message: 'Gateway killed, lock files removed, new instance starting...', previousProcessId: existingProcess?.id, }); } catch (error) { @@ -295,6 +360,129 @@ adminApi.post('/gateway/restart', async (c) => { } }); +// GET /api/admin/diagnostic - Diagnose API connectivity and config issues +adminApi.get('/diagnostic', async (c) => { + const sandbox = c.get('sandbox'); + const results: Record = {}; + + // 1. Worker env vars + results.workerEnvKeys = { + ANTHROPIC_API_KEY: c.env.ANTHROPIC_API_KEY?.substring(0, 10) || 'NOT SET', + CLOUDFLARE_AI_GATEWAY_API_KEY: c.env.CLOUDFLARE_AI_GATEWAY_API_KEY?.substring(0, 10) || 'NOT SET', + CF_AI_GATEWAY_MODEL: c.env.CF_AI_GATEWAY_MODEL || 'NOT SET', + CF_AI_GATEWAY_ACCOUNT_ID: c.env.CF_AI_GATEWAY_ACCOUNT_ID || 'NOT SET', + CF_AI_GATEWAY_GATEWAY_ID: c.env.CF_AI_GATEWAY_GATEWAY_ID || 'NOT SET', + MOLTBOT_GATEWAY_TOKEN: !!c.env.MOLTBOT_GATEWAY_TOKEN, + }; + + // 2. AI Gateway URL construction (mirrors start-openclaw.sh logic) + if (c.env.CF_AI_GATEWAY_MODEL) { + const raw = c.env.CF_AI_GATEWAY_MODEL; + const slashIdx = raw.indexOf('/'); + const gwProvider = slashIdx > 0 ? raw.substring(0, slashIdx) : 'unknown'; + const modelId = slashIdx > 0 ? raw.substring(slashIdx + 1) : raw; + const accountId = c.env.CF_AI_GATEWAY_ACCOUNT_ID; + const gatewayId = c.env.CF_AI_GATEWAY_GATEWAY_ID; + let baseUrl = ''; + if (accountId && gatewayId) { + baseUrl = `https://gateway.ai.cloudflare.com/v1/${accountId}/${gatewayId}/${gwProvider}`; + if (gwProvider === 'workers-ai') baseUrl += '/v1'; + } + const providerName = `cf-ai-gw-${gwProvider}`; + const api = gwProvider === 'anthropic' ? 
'anthropic-messages' : 'openai-completions'; + results.aiGateway = { + cfAiGatewayModel: raw, + gwProvider, + modelId, + providerName, + api, + baseUrl, + hasApiKey: !!c.env.CLOUDFLARE_AI_GATEWAY_API_KEY, + }; + } else { + results.aiGateway = { status: 'CF_AI_GATEWAY_MODEL not set - AI Gateway disabled' }; + } + + // 3. Read openclaw config from container + try { + const configResult = await sandbox.exec('cat /root/.openclaw/openclaw.json'); + const config = JSON.parse(configResult.stdout || '{}'); + const providers = config.models?.providers || {}; + const maskedProviders: Record = {}; + for (const [name, prov] of Object.entries(providers)) { + const p = prov as Record; + maskedProviders[name] = { + baseUrl: p.baseUrl || 'default', + api: p.api || 'default', + hasApiKey: !!p.apiKey, + apiKeyPrefix: typeof p.apiKey === 'string' ? p.apiKey.substring(0, 10) : 'none', + models: p.models, + }; + } + results.openclawConfig = { + providers: maskedProviders, + defaultModel: config.agents?.defaults?.model || 'not set (uses built-in)', + }; + } catch (err) { + results.openclawConfig = { error: err instanceof Error ? err.message : 'Failed to read' }; + } + + // 4. Gateway process check + try { + const pidCheck = await sandbox.exec( + 'ps aux | grep "openclaw gateway" | grep -v grep | head -3', + ); + results.gatewayProcess = pidCheck.stdout?.trim() || 'not found'; + } catch { + results.gatewayProcess = 'check failed'; + } + + // 5. 
Test direct Anthropic API + try { + const apiKey = c.env.ANTHROPIC_API_KEY || ''; + const curlResult = await sandbox.exec( + `curl -s -w "\\n---HTTP_CODE:%{http_code}---" -X POST https://api.anthropic.com/v1/messages -H "content-type: application/json" -H "x-api-key: ${apiKey}" -H "anthropic-version: 2023-06-01" -d '{"model":"claude-sonnet-4-5-20250929","max_tokens":10,"messages":[{"role":"user","content":"hi"}]}' 2>&1 | head -5`, + ); + const output = curlResult.stdout || ''; + const httpCodeMatch = output.match(/---HTTP_CODE:(\d+)---/); + results.directApi = { httpCode: httpCodeMatch?.[1] || 'unknown' }; + } catch (err) { + results.directApi = { error: err instanceof Error ? err.message : 'failed' }; + } + + // 6. Test AI Gateway URL (if configured) + if (c.env.CF_AI_GATEWAY_MODEL && c.env.CF_AI_GATEWAY_ACCOUNT_ID && c.env.CF_AI_GATEWAY_GATEWAY_ID) { + try { + const raw = c.env.CF_AI_GATEWAY_MODEL; + const slashIdx = raw.indexOf('/'); + const gwProvider = slashIdx > 0 ? raw.substring(0, slashIdx) : ''; + const modelId = slashIdx > 0 ? 
raw.substring(slashIdx + 1) : raw; + const baseUrl = `https://gateway.ai.cloudflare.com/v1/${c.env.CF_AI_GATEWAY_ACCOUNT_ID}/${c.env.CF_AI_GATEWAY_GATEWAY_ID}/${gwProvider}`; + const apiKey = c.env.CLOUDFLARE_AI_GATEWAY_API_KEY || c.env.ANTHROPIC_API_KEY || ''; + + if (gwProvider === 'anthropic') { + const curlResult = await sandbox.exec( + `curl -s -w "\\n---HTTP_CODE:%{http_code}---" -X POST "${baseUrl}/v1/messages" -H "content-type: application/json" -H "x-api-key: ${apiKey}" -H "anthropic-version: 2023-06-01" -d '{"model":"${modelId}","max_tokens":10,"messages":[{"role":"user","content":"hi"}]}' 2>&1 | head -10`, + ); + const output = curlResult.stdout || ''; + const httpCodeMatch = output.match(/---HTTP_CODE:(\d+)---/); + const body = output.replace(/---HTTP_CODE:\d+---/, '').trim(); + results.aiGatewayTest = { + url: `${baseUrl}/v1/messages`, + httpCode: httpCodeMatch?.[1] || 'unknown', + response: body.substring(0, 500), + }; + } else { + results.aiGatewayTest = { status: `Non-anthropic provider: ${gwProvider}`, url: baseUrl }; + } + } catch (err) { + results.aiGatewayTest = { error: err instanceof Error ? err.message : 'failed' }; + } + } + + return c.json(results); +}); + // Mount admin API routes under /admin api.route('/admin', adminApi); diff --git a/start-openclaw.sh b/start-openclaw.sh index c862a80ce..66b79a6b7 100644 --- a/start-openclaw.sh +++ b/start-openclaw.sh @@ -217,6 +217,24 @@ if (process.env.CF_AI_GATEWAY_MODEL) { } else { console.warn('CF_AI_GATEWAY_MODEL set but missing required config (account ID, gateway ID, or API key)'); } +} else { + // No AI Gateway model override — clean up any stale cf-ai-gw- providers + // restored from R2 backup and reset default model to built-in anthropic. 
+ if (config.models && config.models.providers) { + for (const key of Object.keys(config.models.providers)) { + if (key.startsWith('cf-ai-gw-')) { + delete config.models.providers[key]; + console.log('Removed stale AI Gateway provider: ' + key); + } + } + } + if (config.agents && config.agents.defaults && config.agents.defaults.model) { + const primary = config.agents.defaults.model.primary || ''; + if (primary.startsWith('cf-ai-gw-')) { + delete config.agents.defaults.model; + console.log('Reset default model (was using removed AI Gateway provider: ' + primary + ')'); + } + } } // Telegram configuration