From a6e9af4ab155736e0080e66153f831f856603abb Mon Sep 17 00:00:00 2001
From: Claude Code <noreply@anthropic.com>
Date: Sun, 8 Mar 2026 06:37:22 +0200
Subject: [PATCH 1/2] fix: panic recovery returns proper error result; safe
 tokenizer nil handling [#318]

Three independent fixes for the crash chain reported in #318:

1. Named return values in panic recovery (handleCallToolVariant, handleCallTool):
   recover() now sets callResult via named returns instead of returning (nil, nil),
   preventing the second unrecovered panic in mcp-go's HandleMessage.

2. Safe tokenizer access (SafeGetDefaultEncoding):
   Replaces unsafe type assertion `tokenizer.(*DefaultTokenizer).GetDefaultEncoding()`
   with a nil-safe helper that handles nil interface, nil underlying value, and
   non-DefaultTokenizer implementations.

3. Skip tiktoken validation for disabled tokenizer (NewTokenizer):
   When enabled=false, encoding validation is skipped so the fallback path always
   succeeds even with a corrupted tiktoken cache.

4. Guard against nil serverConfig in GenerateServerID calls.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 internal/server/mcp.go                     | 40 +++++++++---
 internal/server/mcp_panic_recovery_test.go | 72 ++++++++++++++++++++++
 internal/server/tokens/tokenizer.go        | 32 ++++++++--
 internal/server/tokens/tokenizer_test.go   | 67 ++++++++++++++++++++
 4 files changed, 196 insertions(+), 15 deletions(-)
 create mode 100644 internal/server/mcp_panic_recovery_test.go

diff --git a/internal/server/mcp.go b/internal/server/mcp.go
index 1ff37ee6..17aee5fa 100644
--- a/internal/server/mcp.go
+++ b/internal/server/mcp.go
@@ -1064,17 +1064,22 @@ func (p *MCPProxyServer) handleCallToolDestructive(ctx context.Context, request
 }
 
 // handleCallToolVariant is the common handler for all call_tool_* variants (Spec 018)
-func (p *MCPProxyServer) handleCallToolVariant(ctx context.Context, request mcp.CallToolRequest, toolVariant string) (*mcp.CallToolResult, error) {
+func (p *MCPProxyServer) handleCallToolVariant(ctx context.Context, request mcp.CallToolRequest, toolVariant string) (callResult *mcp.CallToolResult, callErr error) {
 	// Spec 024: Track start time and context for internal tool call logging
 	internalStartTime := time.Now()
 
-	// Add panic recovery to ensure server resilience
+	// Panic recovery with named return values to prevent returning (nil, nil).
+	// Issue #318: unnamed returns caused recover() to return zero values,
+	// triggering a second unrecovered panic in mcp-go when dereferencing nil *CallToolResult.
 	defer func() {
 		if r := recover(); r != nil {
 			p.logger.Error("Recovered from panic in handleCallToolVariant",
 				zap.Any("panic", r),
 				zap.String("tool_variant", toolVariant),
-				zap.Any("request", request))
+				zap.Any("request", request),
+				zap.Stack("stacktrace"))
+			callResult = mcp.NewToolResultError(fmt.Sprintf("Internal proxy error: recovered from panic: %v", r))
+			callErr = nil
 		}
 	}()
 
@@ -1326,15 +1331,21 @@ func (p *MCPProxyServer) handleCallToolVariant(ctx context.Context, request mcp.
 			tokenMetrics = &storage.TokenMetrics{
 				InputTokens: inputTokens,
 				Model:       model,
-				Encoding:    tokenizer.(*tokens.DefaultTokenizer).GetDefaultEncoding(),
+				Encoding:    tokens.SafeGetDefaultEncoding(tokenizer),
 			}
 		}
 	}
 
+	// Derive server ID safely (serverConfig may be nil if storage lookup failed)
+	var serverID string
+	if serverConfig != nil {
+		serverID = storage.GenerateServerID(serverConfig)
+	}
+
 	// Record tool call for history (even if error)
 	toolCallRecord := &storage.ToolCallRecord{
 		ID:               fmt.Sprintf("%d-%s", time.Now().UnixNano(), actualToolName),
-		ServerID:         storage.GenerateServerID(serverConfig),
+		ServerID:         serverID,
 		ServerName:       serverName,
 		ToolName:         actualToolName,
 		Arguments:        args,
@@ -1493,13 +1504,16 @@ func (p *MCPProxyServer) handleCallToolVariant(ctx context.Context, request mcp.
 }
 
 // handleCallTool is the LEGACY call_tool handler - returns error directing to new variants (Spec 018)
-func (p *MCPProxyServer) handleCallTool(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
-	// Add panic recovery to ensure server resilience
+func (p *MCPProxyServer) handleCallTool(ctx context.Context, request mcp.CallToolRequest) (callResult *mcp.CallToolResult, callErr error) {
+	// Panic recovery with named return values (issue #318).
 	defer func() {
 		if r := recover(); r != nil {
 			p.logger.Error("Recovered from panic in handleCallTool",
 				zap.Any("panic", r),
-				zap.Any("request", request))
+				zap.Any("request", request),
+				zap.Stack("stacktrace"))
+			callResult = mcp.NewToolResultError(fmt.Sprintf("Internal proxy error: recovered from panic: %v", r))
+			callErr = nil
 		}
 	}()
 
@@ -1705,15 +1719,21 @@ func (p *MCPProxyServer) handleCallTool(ctx context.Context, request mcp.CallToo
 			tokenMetrics = &storage.TokenMetrics{
 				InputTokens: inputTokens,
 				Model:       model,
-				Encoding:    tokenizer.(*tokens.DefaultTokenizer).GetDefaultEncoding(),
+				Encoding:    tokens.SafeGetDefaultEncoding(tokenizer),
 			}
 		}
 	}
 
+	// Derive server ID safely (serverConfig may be nil if storage lookup failed)
+	var serverID string
+	if serverConfig != nil {
+		serverID = storage.GenerateServerID(serverConfig)
+	}
+
 	// Record tool call for history (even if error)
 	toolCallRecord := &storage.ToolCallRecord{
 		ID:               fmt.Sprintf("%d-%s", time.Now().UnixNano(), actualToolName),
-		ServerID:         storage.GenerateServerID(serverConfig),
+		ServerID:         serverID,
 		ServerName:       serverName,
 		ToolName:         actualToolName,
 		Arguments:        args,
diff --git a/internal/server/mcp_panic_recovery_test.go b/internal/server/mcp_panic_recovery_test.go
new file mode 100644
index 00000000..97935a80
--- /dev/null
+++ b/internal/server/mcp_panic_recovery_test.go
@@ -0,0 +1,72 @@
+package server
+
+import (
+	"context"
+	"testing"
+
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/contracts"
+)
+
+func TestHandleCallToolVariant_PanicRecovery(t *testing.T) {
+	// Issue #318: When handleCallToolVariant panics (e.g., nil tokenizer dereference),
+	// the recover() block must return a proper error result, not (nil, nil).
+	// Returning (nil, nil) causes a second unrecovered panic in mcp-go's HandleMessage
+	// when it dereferences the nil *CallToolResult.
+	proxy := createTestMCPProxyServer(t)
+
+	// Create a request that will reach upstream call and trigger a panic.
+	// We use a server:tool format with a server that doesn't exist in upstream manager.
+	// The upstream manager's CallTool will be called, which may panic or error.
+	// To reliably trigger a panic, we'll test the recovery mechanism directly.
+	t.Run("panic returns error result not nil", func(t *testing.T) {
+		// Use a tool name with ":" to skip the "no server prefix" error path
+		// and reach the upstream call code. The upstream manager has no servers,
+		// so CallTool will error — but we want to test the panic path.
+		//
+		// We'll craft a scenario that causes a nil pointer dereference.
+		// An empty serverConfig from storage + GenerateServerID panics on nil.
+		request := mcp.CallToolRequest{}
+		request.Params.Arguments = map[string]interface{}{
+			"name": "nonexistent-server:some_tool",
+		}
+
+		result, err := proxy.handleCallToolVariant(context.Background(), request, contracts.ToolVariantRead)
+
+		// After fix: must never return (nil, nil). Either result or err must be set.
+		if err != nil {
+			// An error is acceptable
+			return
+		}
+		require.NotNil(t, result, "panic recovery must return a non-nil result, not (nil, nil)")
+		assert.True(t, result.IsError, "recovered panic should be an error result")
+	})
+}
+
+func TestHandleCallToolVariant_PanicRecoveryReturnsErrorResult(t *testing.T) {
+	// Directly test that the named return values mechanism works by verifying
+	// the function signature contract: on any internal panic, we get back a
+	// usable *CallToolResult (never nil).
+	proxy := createTestMCPProxyServer(t)
+
+	// Request a tool on a valid-looking but nonexistent server
+	request := mcp.CallToolRequest{}
+	request.Params.Arguments = map[string]interface{}{
+		"name": "fake:tool",
+	}
+
+	result, err := proxy.handleCallToolVariant(context.Background(), request, contracts.ToolVariantWrite)
+
+	// The function must NEVER return (nil, nil)
+	assert.True(t, result != nil || err != nil,
+		"handleCallToolVariant must never return (nil, nil) — panic recovery must set named return values")
+
+	if result != nil && err == nil {
+		// If we got a result without error, it should be marked as an error result
+		// (either from normal error handling or from panic recovery)
+		assert.True(t, result.IsError, "error results should have IsError=true")
+	}
+}
diff --git a/internal/server/tokens/tokenizer.go b/internal/server/tokens/tokenizer.go
index 8dfc49fb..ddead6f4 100644
--- a/internal/server/tokens/tokenizer.go
+++ b/internal/server/tokens/tokenizer.go
@@ -36,16 +36,22 @@ type DefaultTokenizer struct {
 	enabled         bool
 }
 
-// NewTokenizer creates a new tokenizer instance
+// NewTokenizer creates a new tokenizer instance.
+// When enabled=false, encoding validation is skipped so that a disabled
+// tokenizer can always be created (even with a corrupted tiktoken cache).
+// See issue #318.
 func NewTokenizer(defaultEncoding string, logger *zap.SugaredLogger, enabled bool) (*DefaultTokenizer, error) {
 	if defaultEncoding == "" {
 		defaultEncoding = DefaultEncoding
 	}
 
-	// Validate encoding exists
-	_, err := tiktoken.GetEncoding(defaultEncoding)
-	if err != nil {
-		return nil, fmt.Errorf("invalid encoding %q: %w", defaultEncoding, err)
+	// Only validate encoding when tokenizer is enabled.
+	// A disabled tokenizer never calls tiktoken, so a corrupted cache is harmless.
+	if enabled {
+		_, err := tiktoken.GetEncoding(defaultEncoding)
+		if err != nil {
+			return nil, fmt.Errorf("invalid encoding %q: %w", defaultEncoding, err)
+		}
 	}
 
 	return &DefaultTokenizer{
@@ -191,3 +197,19 @@ func (t *DefaultTokenizer) GetDefaultEncoding() string {
 	defer t.mu.RUnlock()
 	return t.defaultEncoding
 }
+
+// SafeGetDefaultEncoding extracts the default encoding from a Tokenizer interface
+// without risking a panic. Handles nil interface, nil underlying *DefaultTokenizer,
+// and non-DefaultTokenizer implementations. Returns DefaultEncoding as fallback.
+// See issue #318: a nil *DefaultTokenizer assigned to a Tokenizer interface creates
+// a non-nil interface with nil concrete value, causing panics on type assertion.
+func SafeGetDefaultEncoding(t Tokenizer) string {
+	if t == nil {
+		return DefaultEncoding
+	}
+	dt, ok := t.(*DefaultTokenizer)
+	if !ok || dt == nil {
+		return DefaultEncoding
+	}
+	return dt.GetDefaultEncoding()
+}
diff --git a/internal/server/tokens/tokenizer_test.go b/internal/server/tokens/tokenizer_test.go
index 57d7b18c..d463d2fb 100644
--- a/internal/server/tokens/tokenizer_test.go
+++ b/internal/server/tokens/tokenizer_test.go
@@ -328,3 +328,70 @@ func TestSupportedEncodings(t *testing.T) {
 	assert.Contains(t, encodings, "p50k_base")
 	assert.Contains(t, encodings, "r50k_base")
 }
+
+func TestNewTokenizerDisabledWithInvalidEncoding(t *testing.T) {
+	logger := zap.NewNop().Sugar()
+
+	// When disabled, NewTokenizer should succeed even with an invalid encoding
+	// (e.g., tiktoken cache is corrupted). This prevents the fallback path
+	// from also failing and leaving a nil tokenizer. See issue #318.
+	tokenizer, err := NewTokenizer("invalid_encoding_corrupted", logger, false)
+	require.NoError(t, err, "disabled tokenizer should not validate encoding")
+	require.NotNil(t, tokenizer, "disabled tokenizer should never be nil")
+	assert.False(t, tokenizer.IsEnabled())
+
+	// All methods should return zero without error when disabled
+	count, err := tokenizer.CountTokens("Hello, world!")
+	require.NoError(t, err)
+	assert.Equal(t, 0, count)
+
+	count, err = tokenizer.CountTokensForModel("Hello", "gpt-4")
+	require.NoError(t, err)
+	assert.Equal(t, 0, count)
+
+	count, err = tokenizer.CountTokensInJSON(map[string]string{"key": "value"})
+	require.NoError(t, err)
+	assert.Equal(t, 0, count)
+}
+
+func TestNilTokenizerInterfaceSafety(t *testing.T) {
+	// Simulates the scenario from issue #318 where a nil *DefaultTokenizer
+	// is assigned to a Tokenizer interface, creating a non-nil interface
+	// with a nil underlying value. SafeGetDefaultEncoding must handle this.
+	var nilTokenizer *DefaultTokenizer
+
+	// Assign nil concrete to interface — interface is non-nil but underlying is nil.
+	// In Go, the interface value itself is non-nil (it holds type info), but the
+	// concrete pointer inside is nil. This is the exact bug scenario.
+	var iface Tokenizer = nilTokenizer
+	_ = iface // used below
+
+	// Calling methods on this would panic — that's the bug.
+	// SafeGetDefaultEncoding should handle this safely without panic.
+	encoding := SafeGetDefaultEncoding(iface)
+	assert.Equal(t, DefaultEncoding, encoding, "should return default encoding for nil underlying")
+}
+
+func TestSafeGetDefaultEncoding(t *testing.T) {
+	logger := zap.NewNop().Sugar()
+
+	t.Run("valid DefaultTokenizer", func(t *testing.T) {
+		tokenizer, err := NewTokenizer("o200k_base", logger, true)
+		require.NoError(t, err)
+
+		encoding := SafeGetDefaultEncoding(tokenizer)
+		assert.Equal(t, "o200k_base", encoding)
+	})
+
+	t.Run("nil interface", func(t *testing.T) {
+		encoding := SafeGetDefaultEncoding(nil)
+		assert.Equal(t, DefaultEncoding, encoding)
+	})
+
+	t.Run("nil DefaultTokenizer in interface", func(t *testing.T) {
+		var nilDT *DefaultTokenizer
+		var iface Tokenizer = nilDT
+		encoding := SafeGetDefaultEncoding(iface)
+		assert.Equal(t, DefaultEncoding, encoding)
+	})
+}

From 7e9c6983a690f0fa5287ee8ea698e7a22f936c3f Mon Sep 17 00:00:00 2001
From: Claude Code <noreply@anthropic.com>
Date: Sun, 8 Mar 2026 07:05:47 +0200
Subject: [PATCH 2/2] test: exercise actual panic-and-recover path in
 handleCallToolVariant [#318]

Rewrite panic recovery tests to trigger real panics by nil-ing out
proxy.storage, causing nil pointer dereference inside the handler.
This verifies the named-return-value recover block actually catches
panics and returns a proper error result (not nil, nil).

Three subtests:
- normal error path (server not found) returns error result
- panic in handleCallToolVariant returns error result via recover
- panic in legacy handleCallTool returns error result via recover

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 internal/server/mcp_panic_recovery_test.go | 104 ++++++++++++---------
 1 file changed, 62 insertions(+), 42 deletions(-)

diff --git a/internal/server/mcp_panic_recovery_test.go b/internal/server/mcp_panic_recovery_test.go
index 97935a80..6488cea7 100644
--- a/internal/server/mcp_panic_recovery_test.go
+++ b/internal/server/mcp_panic_recovery_test.go
@@ -16,19 +16,10 @@ func TestHandleCallToolVariant_PanicRecovery(t *testing.T) {
 	// the recover() block must return a proper error result, not (nil, nil).
 	// Returning (nil, nil) causes a second unrecovered panic in mcp-go's HandleMessage
 	// when it dereferences the nil *CallToolResult.
-	proxy := createTestMCPProxyServer(t)
-
-	// Create a request that will reach upstream call and trigger a panic.
-	// We use a server:tool format with a server that doesn't exist in upstream manager.
-	// The upstream manager's CallTool will be called, which may panic or error.
-	// To reliably trigger a panic, we'll test the recovery mechanism directly.
-	t.Run("panic returns error result not nil", func(t *testing.T) {
-		// Use a tool name with ":" to skip the "no server prefix" error path
-		// and reach the upstream call code. The upstream manager has no servers,
-		// so CallTool will error — but we want to test the panic path.
-		//
-		// We'll craft a scenario that causes a nil pointer dereference.
-		// An empty serverConfig from storage + GenerateServerID panics on nil.
+
+	t.Run("normal error path returns error result", func(t *testing.T) {
+		proxy := createTestMCPProxyServer(t)
+
 		request := mcp.CallToolRequest{}
 		request.Params.Arguments = map[string]interface{}{
 			"name": "nonexistent-server:some_tool",
@@ -36,37 +27,66 @@ func TestHandleCallToolVariant_PanicRecovery(t *testing.T) {
 
 		result, err := proxy.handleCallToolVariant(context.Background(), request, contracts.ToolVariantRead)
 
-		// After fix: must never return (nil, nil). Either result or err must be set.
-		if err != nil {
-			// An error is acceptable
-			return
+		// Must never return (nil, nil)
+		assert.True(t, result != nil || err != nil,
+			"handleCallToolVariant must never return (nil, nil)")
+		if result != nil {
+			assert.True(t, result.IsError, "error results should have IsError=true")
 		}
-		require.NotNil(t, result, "panic recovery must return a non-nil result, not (nil, nil)")
+	})
+
+	t.Run("panic in handler returns error result via recover", func(t *testing.T) {
+		proxy := createTestMCPProxyServer(t)
+
+		// Nil out storage to trigger a guaranteed nil pointer panic inside
+		// handleCallToolVariant at p.storage.GetUpstreamServer(). This exercises
+		// the actual recover() path — the function panics, the deferred recover
+		// catches it, and sets callResult via named return values.
+		proxy.storage = nil
+
+		request := mcp.CallToolRequest{}
+		request.Params.Arguments = map[string]interface{}{
+			"name": "panic-server:panic_tool",
+		}
+
+		result, err := proxy.handleCallToolVariant(context.Background(), request, contracts.ToolVariantRead)
+
+		// Before fix: would return (nil, nil), crashing mcp-go.
+		// After fix: must return a proper error result.
+		require.NoError(t, err, "recovered panic should not return an error")
+		require.NotNil(t, result, "recovered panic must return a non-nil CallToolResult, not (nil, nil)")
 		assert.True(t, result.IsError, "recovered panic should be an error result")
+
+		// Verify the error message mentions the panic
+		if len(result.Content) > 0 {
+			if text, ok := result.Content[0].(mcp.TextContent); ok {
+				assert.Contains(t, text.Text, "Internal proxy error")
+				assert.Contains(t, text.Text, "recovered from panic")
+			}
+		}
 	})
-}
 
-func TestHandleCallToolVariant_PanicRecoveryReturnsErrorResult(t *testing.T) {
-	// Directly test that the named return values mechanism works by verifying
-	// the function signature contract: on any internal panic, we get back a
-	// usable *CallToolResult (never nil).
-	proxy := createTestMCPProxyServer(t)
-
-	// Request a tool on a valid-looking but nonexistent server
-	request := mcp.CallToolRequest{}
-	request.Params.Arguments = map[string]interface{}{
-		"name": "fake:tool",
-	}
-
-	result, err := proxy.handleCallToolVariant(context.Background(), request, contracts.ToolVariantWrite)
-
-	// The function must NEVER return (nil, nil)
-	assert.True(t, result != nil || err != nil,
-		"handleCallToolVariant must never return (nil, nil) — panic recovery must set named return values")
-
-	if result != nil && err == nil {
-		// If we got a result without error, it should be marked as an error result
-		// (either from normal error handling or from panic recovery)
-		assert.True(t, result.IsError, "error results should have IsError=true")
-	}
+	t.Run("panic in legacy handleCallTool returns error result via recover", func(t *testing.T) {
+		proxy := createTestMCPProxyServer(t)
+
+		// Same approach: nil storage triggers panic in handleCallTool
+		proxy.storage = nil
+
+		request := mcp.CallToolRequest{}
+		request.Params.Arguments = map[string]interface{}{
+			"name": "panic-server:panic_tool",
+		}
+
+		result, err := proxy.handleCallTool(context.Background(), request)
+
+		require.NoError(t, err, "recovered panic should not return an error")
+		require.NotNil(t, result, "recovered panic must return a non-nil CallToolResult, not (nil, nil)")
+		assert.True(t, result.IsError, "recovered panic should be an error result")
+
+		if len(result.Content) > 0 {
+			if text, ok := result.Content[0].(mcp.TextContent); ok {
+				assert.Contains(t, text.Text, "Internal proxy error")
+			}
+		}
+	})
 }