redis
diff --git a/‎config_comparison_test.go‎
Lines changed: 226 additions & 0 deletions b/‎config_comparison_test.go‎
Lines changed: 226 additions & 0 deletions
diff --git a/‎internal/pool/double_freeturn_simple_test.go‎
Lines changed: 158 additions & 0 deletions b/‎internal/pool/double_freeturn_simple_test.go‎
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,226 @@
+package redis
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// TestBadConfigurationHighLoad fires a burst of concurrent GET requests at a
+// client whose pool uses the settings the author describes as the defaults on
+// an 8-CPU machine, with a dialer that needs 300ms per connection.
+//
+// Each request only allows 200ms, which is less than a single dial takes. The
+// test is demonstrative: it logs the outcome counters and prints a warning when
+// fewer than 50% of the requests timed out, but it never fails on that rate.
+func TestBadConfigurationHighLoad(t *testing.T) {
+	// Load shape: 200 requests, each with a 200ms deadline.
+	const (
+		numRequests    = 200
+		requestTimeout = 200 * time.Millisecond
+	)
+
+	// Dialer-side counters.
+	var (
+		dialsStarted  atomic.Int32
+		dialsFinished atomic.Int32
+		dialsAborted  atomic.Int32
+	)
+
+	// slowDialer stands in for a slow network (latency, TLS handshake, ...):
+	// it waits 300ms unless the dial context ends first.
+	//
+	// NOTE(review): the returned zero-value *net.TCPConn has no underlying
+	// socket, so I/O the client attempts on it presumably fails; such failures
+	// would be counted under "Other errors" below - confirm this is intended.
+	slowDialer := func(ctx context.Context, network, addr string) (net.Conn, error) {
+		dialsStarted.Add(1)
+		dialDone := time.After(300 * time.Millisecond)
+		select {
+		case <-ctx.Done():
+			dialsAborted.Add(1)
+			return nil, ctx.Err()
+		case <-dialDone:
+			dialsFinished.Add(1)
+			return &net.TCPConn{}, nil
+		}
+	}
+
+	// BAD CONFIGURATION: values mirroring the defaults on an 8-CPU machine
+	// (PoolSize = 10 * 8 = 80, MaxConcurrentDials = 80, no idle connections).
+	opt := &Options{
+		Addr:               "localhost:6379",
+		Dialer:             slowDialer,
+		PoolSize:           80, // Default: 10 * GOMAXPROCS
+		MaxConcurrentDials: 80, // Default: same as PoolSize
+		MinIdleConns:       0,  // Default: no pre-warming
+		DialTimeout:        5 * time.Second,
+	}
+
+	client := NewClient(opt)
+	defer client.Close()
+
+	// Request-side counters.
+	var (
+		inFlight  sync.WaitGroup
+		succeeded atomic.Int32
+		timedOut  atomic.Int32
+		otherErrs atomic.Int32
+	)
+
+	// issue performs one GET under its own deadline and records the outcome.
+	issue := func(id int) {
+		defer inFlight.Done()
+
+		ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
+		defer cancel()
+
+		key := fmt.Sprintf("key-%d", id)
+		_, err := client.Get(ctx, key).Result()
+
+		switch {
+		case err == nil:
+			succeeded.Add(1)
+		case ctx.Err() == context.DeadlineExceeded || err == context.DeadlineExceeded:
+			timedOut.Add(1)
+		default:
+			otherErrs.Add(1)
+		}
+	}
+
+	begin := time.Now()
+	for i := 0; i < numRequests; i++ {
+		inFlight.Add(1)
+		go issue(i)
+
+		// Pause briefly after every 20th launch so the burst is slightly staggered.
+		if i%20 == 0 {
+			time.Sleep(5 * time.Millisecond)
+		}
+	}
+	inFlight.Wait()
+	elapsed := time.Since(begin)
+
+	// percent expresses a counter as a percentage of all issued requests.
+	percent := func(n int32) float64 {
+		return float64(n) / float64(numRequests) * 100
+	}
+	timeoutRate := percent(timedOut.Load())
+	successRate := percent(succeeded.Load())
+
+	t.Logf("\n=== BAD CONFIGURATION (Default Settings) ===")
+	t.Logf("Configuration:")
+	t.Logf("  PoolSize: %d", opt.PoolSize)
+	t.Logf("  MaxConcurrentDials: %d", opt.MaxConcurrentDials)
+	t.Logf("  MinIdleConns: %d", opt.MinIdleConns)
+	t.Logf("\nResults:")
+	t.Logf("  Total time: %v", elapsed)
+	t.Logf("  Successes: %d (%.1f%%)", succeeded.Load(), successRate)
+	t.Logf("  Timeouts: %d (%.1f%%)", timedOut.Load(), timeoutRate)
+	t.Logf("  Other errors: %d", otherErrs.Load())
+	// The dial counters are read here, at log time, because dials may still be
+	// finishing after the requests themselves have returned.
+	t.Logf("  Total dials: %d (succeeded: %d, failed: %d)",
+		dialsStarted.Load(), dialsFinished.Load(), dialsAborted.Load())
+
+	// Expectation for this configuration: a dial (300ms) outlasts a request
+	// (200ms), so a high timeout rate (>50%) is anticipated. A lower rate is
+	// only reported, not treated as a failure.
+	if timeoutRate < 50 {
+		t.Logf("WARNING: Expected high timeout rate (>50%%), got %.1f%%. Test may not be stressing the system enough.", timeoutRate)
+	}
+}
+
+// TestGoodConfigurationHighLoad repeats the high-load scenario (200 concurrent
+// GETs, 200ms deadline each, 300ms per dial) against a client with a larger
+// pool, a higher concurrent-dial limit and a non-zero MinIdleConns.
+//
+// The test fails when more than 20% of the requests are classified as timeouts.
+func TestGoodConfigurationHighLoad(t *testing.T) {
+	// Same load shape as the bad-configuration scenario.
+	const (
+		numRequests    = 200
+		requestTimeout = 200 * time.Millisecond
+	)
+
+	// Dialer-side counters.
+	var (
+		dialsStarted  atomic.Int32
+		dialsFinished atomic.Int32
+		dialsAborted  atomic.Int32
+	)
+
+	// slowDialer takes 300ms per dial unless the dial context ends first.
+	//
+	// NOTE(review): the returned zero-value *net.TCPConn has no underlying
+	// socket, so I/O the client attempts on it presumably fails; those
+	// failures would count as "Other errors", not as timeouts - confirm that
+	// the timeout-rate assertion below still measures what is intended.
+	slowDialer := func(ctx context.Context, network, addr string) (net.Conn, error) {
+		dialsStarted.Add(1)
+		dialDone := time.After(300 * time.Millisecond)
+		select {
+		case <-ctx.Done():
+			dialsAborted.Add(1)
+			return nil, ctx.Err()
+		case <-dialDone:
+			dialsFinished.Add(1)
+			return &net.TCPConn{}, nil
+		}
+	}
+
+	// GOOD CONFIGURATION: tuned for high load.
+	opt := &Options{
+		Addr:               "localhost:6379",
+		Dialer:             slowDialer,
+		PoolSize:           300, // Increased from 80
+		MaxConcurrentDials: 300, // Increased from 80
+		MinIdleConns:       50,  // Pre-warm the pool
+		DialTimeout:        5 * time.Second,
+	}
+
+	client := NewClient(opt)
+	defer client.Close()
+
+	// Give the pool a moment to warm up before applying load.
+	// NOTE(review): this 100ms pause is shorter than the 300ms a dial takes in
+	// slowDialer, so pre-warm dials presumably have not completed when the
+	// load starts - confirm whether a longer wait is intended.
+	time.Sleep(100 * time.Millisecond)
+
+	// Request-side counters.
+	var (
+		inFlight  sync.WaitGroup
+		succeeded atomic.Int32
+		timedOut  atomic.Int32
+		otherErrs atomic.Int32
+	)
+
+	// issue performs one GET under its own deadline and records the outcome.
+	issue := func(id int) {
+		defer inFlight.Done()
+
+		ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
+		defer cancel()
+
+		key := fmt.Sprintf("key-%d", id)
+		_, err := client.Get(ctx, key).Result()
+
+		switch {
+		case err == nil:
+			succeeded.Add(1)
+		case ctx.Err() == context.DeadlineExceeded || err == context.DeadlineExceeded:
+			timedOut.Add(1)
+		default:
+			otherErrs.Add(1)
+		}
+	}
+
+	begin := time.Now()
+	for i := 0; i < numRequests; i++ {
+		inFlight.Add(1)
+		go issue(i)
+
+		// Pause briefly after every 20th launch so the burst is slightly staggered.
+		if i%20 == 0 {
+			time.Sleep(5 * time.Millisecond)
+		}
+	}
+	inFlight.Wait()
+	elapsed := time.Since(begin)
+
+	// percent expresses a counter as a percentage of all issued requests.
+	percent := func(n int32) float64 {
+		return float64(n) / float64(numRequests) * 100
+	}
+	timeoutRate := percent(timedOut.Load())
+	successRate := percent(succeeded.Load())
+
+	t.Logf("\n=== GOOD CONFIGURATION (Tuned Settings) ===")
+	t.Logf("Configuration:")
+	t.Logf("  PoolSize: %d", opt.PoolSize)
+	t.Logf("  MaxConcurrentDials: %d", opt.MaxConcurrentDials)
+	t.Logf("  MinIdleConns: %d", opt.MinIdleConns)
+	t.Logf("\nResults:")
+	t.Logf("  Total time: %v", elapsed)
+	t.Logf("  Successes: %d (%.1f%%)", succeeded.Load(), successRate)
+	t.Logf("  Timeouts: %d (%.1f%%)", timedOut.Load(), timeoutRate)
+	t.Logf("  Other errors: %d", otherErrs.Load())
+	// The dial counters are read here, at log time, because dials may still be
+	// finishing after the requests themselves have returned.
+	t.Logf("  Total dials: %d (succeeded: %d, failed: %d)",
+		dialsStarted.Load(), dialsFinished.Load(), dialsAborted.Load())
+
+	// Expectation for this configuration: the higher dial limit and the
+	// pre-warmed pool should keep the timeout rate low (<20%).
+	if timeoutRate > 20 {
+		t.Errorf("Expected low timeout rate (<20%%), got %.1f%%", timeoutRate)
+	}
+}
+
+// TestConfigurationComparison executes the bad- and good-configuration
+// scenarios one after the other as named subtests, so their logged results
+// appear side by side in a single run.
+func TestConfigurationComparison(t *testing.T) {
+	scenarios := []struct {
+		name string
+		run  func(*testing.T)
+	}{
+		{name: "BadConfiguration", run: TestBadConfigurationHighLoad},
+		{name: "GoodConfiguration", run: TestGoodConfigurationHighLoad},
+	}
+
+	for _, sc := range scenarios {
+		t.Run(sc.name, sc.run)
+	}
+}
+
@@ -0,0 +1,158 @@
+package pool_test
+
+import (
+	"context"
+	"net"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/redis/go-redis/v9/internal/pool"
+)
+
+// TestDoubleFreeTurnSimple tests the double-free bug with a simple scenario.
+// This test FAILS with the OLD code and PASSES with the NEW code.
+//
+// Scenario (driven by the timings below):
+//  1. Request A times out after 100ms; its dial (Dial A, 150ms) completes
+//     later and the connection is delivered to Request B.
+//  2. Request B's own dial (Dial B, 300ms) completes after that.
+//  3. With the bug, Dial B frees Request B's turn even though Request B is
+//     using connection A.
+//  4. Request B then calls Put() and frees the turn AGAIN (double-free),
+//     which allows more concurrent operations than PoolSize permits.
+//
+// Detection method:
+//   - After Dial B has completed, and before Request B calls Put(), the test
+//     samples QueueLen().
+//   - With the bug: QueueLen is 0 (the turn was already freed) and the test fails.
+//   - With the fix: QueueLen is 1 (Request B still holds its turn).
+//
+// Every wait on another goroutine is bounded, so an unexpected failure in
+// Request B or a dial that never happens fails the test instead of hanging it.
+func TestDoubleFreeTurnSimple(t *testing.T) {
+	const (
+		// stepTimeout bounds Request B's wait for Dial B (expected after ~300ms).
+		stepTimeout = 2 * time.Second
+		// overallTimeout bounds each wait of the test goroutine on Request B.
+		overallTimeout = 5 * time.Second
+	)
+
+	ctx := context.Background()
+
+	var dialCount atomic.Int32
+	dialBComplete := make(chan struct{})
+	requestBGotConn := make(chan struct{})
+	// readyForCheck is closed by Request B once Dial B has completed and had
+	// time to return, but BEFORE Request B calls Put().
+	readyForCheck := make(chan struct{})
+
+	controlledDialer := func(ctx context.Context) (net.Conn, error) {
+		switch dialCount.Add(1) {
+		case 1:
+			// Dial A: takes 150ms
+			time.Sleep(150 * time.Millisecond)
+			t.Logf("Dial A completed")
+		case 2:
+			// Dial B: takes 300ms (longer than Dial A)
+			time.Sleep(300 * time.Millisecond)
+			t.Logf("Dial B completed")
+			close(dialBComplete)
+		default:
+			// Other dials: fast
+			time.Sleep(10 * time.Millisecond)
+		}
+
+		return newDummyConn(), nil
+	}
+
+	testPool := pool.NewConnPool(&pool.Options{
+		Dialer:             controlledDialer,
+		PoolSize:           2, // Only 2 concurrent operations allowed
+		MaxConcurrentDials: 5,
+		DialTimeout:        1 * time.Second,
+		PoolTimeout:        1 * time.Second,
+	})
+	defer testPool.Close()
+
+	// Request A: Short timeout (100ms), will timeout before dial completes (150ms)
+	go func() {
+		shortCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
+		defer cancel()
+
+		_, err := testPool.Get(shortCtx)
+		if err != nil {
+			t.Logf("Request A: Timed out as expected: %v", err)
+		}
+	}()
+
+	// Wait for Request A to start
+	time.Sleep(20 * time.Millisecond)
+
+	// Request B: Long timeout, will receive connection from Request A's dial
+	requestBDone := make(chan struct{})
+	go func() {
+		defer close(requestBDone)
+
+		longCtx, cancel := context.WithTimeout(ctx, 1*time.Second)
+		defer cancel()
+
+		cn, err := testPool.Get(longCtx)
+		if err != nil {
+			t.Errorf("Request B: Should have received connection but got error: %v", err)
+			return
+		}
+
+		t.Logf("Request B: Got connection from Request A's dial")
+		close(requestBGotConn)
+
+		// Wait for dial B to complete, but not forever: if a second dial
+		// never happens the scenario is broken and the test must fail.
+		select {
+		case <-dialBComplete:
+		case <-time.After(stepTimeout):
+			t.Errorf("Request B: Dial B did not complete within %v", stepTimeout)
+			testPool.Put(ctx, cn)
+			return
+		}
+
+		t.Logf("Request B: Dial B completed")
+
+		// Wait a bit to allow Dial B goroutine to finish and call freeTurn()
+		time.Sleep(100 * time.Millisecond)
+
+		// Signal that we're ready for the test to check semaphore state
+		close(readyForCheck)
+
+		// Wait for the test to check QueueLen
+		time.Sleep(200 * time.Millisecond)
+
+		t.Logf("Request B: Now calling Put()")
+		testPool.Put(ctx, cn)
+		t.Logf("Request B: Put() called")
+	}()
+
+	// waitFor blocks until ch is closed. It aborts the test if Request B ends
+	// first (it closes requestBDone on every exit path) or if the bound expires.
+	waitFor := func(ch <-chan struct{}, what string) {
+		t.Helper()
+		select {
+		case <-ch:
+		case <-requestBDone:
+			t.Fatalf("Request B finished before %s", what)
+		case <-time.After(overallTimeout):
+			t.Fatalf("Timed out after %v waiting for %s", overallTimeout, what)
+		}
+	}
+
+	// Wait for Request B to get the connection
+	waitFor(requestBGotConn, "it obtained a connection")
+
+	// Wait for Dial B to complete and freeTurn() to be called
+	waitFor(readyForCheck, "the critical window was reached")
+
+	// NOW WE'RE IN THE CRITICAL WINDOW
+	// Request B is holding a connection (from Dial A)
+	// Dial B has completed and returned (freeTurn() has been called)
+	// With the bug:
+	//   - Dial B freed Request B's turn (BUG!)
+	//   - QueueLen should be 0
+	// With the fix:
+	//   - Dial B did NOT free Request B's turn
+	//   - QueueLen should be 1 (Request B still holds the turn)
+
+	t.Logf("\n=== CRITICAL CHECK: QueueLen ===")
+	t.Logf("Request B is holding a connection, Dial B has completed and returned")
+	queueLen := testPool.QueueLen()
+	t.Logf("QueueLen: %d", queueLen)
+
+	// Wait for Request B to finish
+	select {
+	case <-requestBDone:
+	case <-time.After(1 * time.Second):
+		t.Logf("Request B timed out")
+	}
+
+	t.Logf("\n=== Results ===")
+	t.Logf("QueueLen during critical window: %d", queueLen)
+	t.Logf("Expected with fix: 1 (Request B still holds the turn)")
+	t.Logf("Expected with bug: 0 (Dial B freed Request B's turn)")
+
+	switch queueLen {
+	case 0:
+		t.Errorf("DOUBLE-FREE BUG DETECTED!")
+		t.Errorf("QueueLen is 0, meaning Dial B freed Request B's turn")
+		t.Errorf("But Request B is still holding a connection, so its turn should NOT be freed yet")
+	case 1:
+		t.Logf("✓ CORRECT: QueueLen is 1")
+		t.Logf("Request B is still holding the turn (will be freed when Request B calls Put())")
+	default:
+		t.Logf("Unexpected QueueLen: %d (expected 1 with fix, 0 with bug)", queueLen)
+	}
+}
+