From 8bb156cd1c5e5e2304a1ec2cbbb37cd350981d83 Mon Sep 17 00:00:00 2001
From: isaacbmiller
Date: Sat, 28 Feb 2026 15:00:37 -0500
Subject: [PATCH] test: add load testing, stress tests, and integration test
 infrastructure

Add a complete performance testing harness:

- Mock OpenAI-compatible LLM server (configurable delay, error rate)
- Fixture dspy-cli project with sync and async modules
- Locust load test with model-routing correctness checks
- Benchmark runner (single scenario) and matrix runner (delay x users)
- Benchmark comparison script (assert_benchmark.py)
- Stress tests: backpressure, error storms, log writer integrity
- Integration tests: concurrent correctness with pytest + httpx
- .gitignore entries for generated results

The integration tests require a running server (not part of normal pytest
runs). The load/stress tests are standalone scripts.

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 .gitignore                                         |   8 +-
 tests/integration/conftest.py                      |  54 ++++
 tests/integration/test_concurrent_requests.py      |  76 ++++++
 tests/load/assert_benchmark.py                     |  55 ++++
 tests/load/fixture_project/dspy.config.yaml        |  24 ++
 .../src/load_test_app/__init__.py                  |   0
 .../src/load_test_app/modules/__init__.py          |   0
 .../load_test_app/modules/async_predict.py         |  13 +
 .../load_test_app/modules/simple_predict.py        |  10 +
 tests/load/locustfile.py                           |  78 ++++++
 tests/load/mock_lm_server.py                       |  83 ++++++
 tests/load/results/.gitkeep                        |   0
 tests/load/run_benchmark.sh                        |  81 ++++++
 tests/load/run_matrix.sh                           | 156 +++++++++++
 tests/load/run_stress_tests.sh                     | 248 ++++++++++++++++++
 tests/load/stress_backpressure.py                  | 152 +++++++++++
 tests/load/stress_error_storm.py                   | 170 ++++++++++++
 tests/load/stress_log_integrity.py                 | 170 ++++++++++++
 18 files changed, 1377 insertions(+), 1 deletion(-)
 create mode 100644 tests/integration/conftest.py
 create mode 100644 tests/integration/test_concurrent_requests.py
 create mode 100644 tests/load/assert_benchmark.py
 create mode 100644 tests/load/fixture_project/dspy.config.yaml
 create mode 100644 tests/load/fixture_project/src/load_test_app/__init__.py
 create mode 100644 tests/load/fixture_project/src/load_test_app/modules/__init__.py
 create mode 100644 tests/load/fixture_project/src/load_test_app/modules/async_predict.py
 create mode 100644 tests/load/fixture_project/src/load_test_app/modules/simple_predict.py
 create mode 100644 tests/load/locustfile.py
 create mode 100644 tests/load/mock_lm_server.py
 create mode 100644 tests/load/results/.gitkeep
 create mode 100755 tests/load/run_benchmark.sh
 create mode 100755 tests/load/run_matrix.sh
 create mode 100755 tests/load/run_stress_tests.sh
 create mode 100644 tests/load/stress_backpressure.py
 create mode 100644 tests/load/stress_error_storm.py
 create mode 100644 tests/load/stress_log_integrity.py

diff --git a/.gitignore b/.gitignore
index ad76c5e..08e7880 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,4 +60,10 @@ mlruns
 mlartifacts
 
 # Documentation
-/site/
\ No newline at end of file
+/site/
+
+# Load test results
+tests/load/results/*.csv
+tests/load/results/*.html
+tests/load/fixture_project/openapi.json
+tests/load/fixture_project/logs/*.log
\ No newline at end of file
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 0000000..d766f22
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,54 @@
+"""
+Starts mock LLM server and dspy-cli server as subprocess fixtures.
+Tests in this directory require these fixtures.
+"""
+import subprocess
+import time
+import os
+import sys
+
+import httpx
+import pytest
+
+
+MOCK_PORT = 9999
+SERVER_PORT = 8000
+FIXTURE_PROJECT = os.path.join(os.path.dirname(__file__), "..", "load", "fixture_project")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def mock_lm_server():
+    proc = subprocess.Popen(
+        [sys.executable, os.path.join(os.path.dirname(__file__), "..", "load", "mock_lm_server.py")],
+        env={**os.environ, "MOCK_DELAY_MS": "50", "MOCK_PORT": str(MOCK_PORT)},
+    )
+    # Wait for mock server to be ready
+    for _ in range(10):
+        try:
+            httpx.get(f"http://127.0.0.1:{MOCK_PORT}/health", timeout=1)
+            break
+        except Exception:
+            time.sleep(0.5)
+    yield proc
+    proc.terminate()
+    proc.wait()
+
+
+@pytest.fixture(scope="session", autouse=True)
+def dspy_cli_server(mock_lm_server):
+    proc = subprocess.Popen(
+        [sys.executable, "-m", "dspy_cli.server.runner",
+         "--port", str(SERVER_PORT), "--host", "127.0.0.1"],
+        cwd=FIXTURE_PROJECT,
+    )
+    # Wait for server to be ready
+    for _ in range(30):
+        try:
+            resp = httpx.get(f"http://127.0.0.1:{SERVER_PORT}/programs", timeout=2)
+            if resp.status_code == 200:
+                break
+        except Exception:
+            time.sleep(1)
+    yield proc
+    proc.terminate()
+    proc.wait()
diff --git a/tests/integration/test_concurrent_requests.py b/tests/integration/test_concurrent_requests.py
new file mode 100644
index 0000000..5392152
--- /dev/null
+++ b/tests/integration/test_concurrent_requests.py
@@ -0,0 +1,76 @@
+"""
+Concurrent correctness tests. Not a load test — verifies that responses
+are correct under concurrency, not just that the server survives.
+
+Requires a running dspy-cli server + mock LLM. Uses the fixtures in conftest.py.
+"""
+import asyncio
+
+import httpx
+import pytest
+
+BASE_URL = "http://127.0.0.1:8000"
+
+
+async def make_request(client: httpx.AsyncClient, endpoint: str, question: str):
+    response = await client.post(
+        f"{BASE_URL}/{endpoint}",
+        json={"question": question},
+        timeout=30.0
+    )
+    return response
+
+
+@pytest.mark.asyncio
+async def test_sync_module_concurrent_correctness():
+    """20 concurrent requests to sync module should all succeed with valid responses."""
+    async with httpx.AsyncClient() as client:
+        tasks = [
+            make_request(client, "SimplePredict", f"Question {i}")
+            for i in range(20)
+        ]
+        responses = await asyncio.gather(*tasks)
+
+    for i, r in enumerate(responses):
+        assert r.status_code == 200, f"Request {i} failed: {r.text}"
+        assert "answer" in r.json(), f"Request {i} missing 'answer': {r.json()}"
+
+
+@pytest.mark.asyncio
+async def test_async_module_concurrent_correctness():
+    """20 concurrent requests to async module should all succeed."""
+    async with httpx.AsyncClient() as client:
+        tasks = [
+            make_request(client, "AsyncPredict", f"Question {i}")
+            for i in range(20)
+        ]
+        responses = await asyncio.gather(*tasks)
+
+    for i, r in enumerate(responses):
+        assert r.status_code == 200, f"Request {i} failed: {r.text}"
+        assert "answer" in r.json(), f"Request {i} missing 'answer': {r.json()}"
+
+
+@pytest.mark.asyncio
+async def test_no_response_cross_contamination():
+    """
+    Verifies that concurrent requests don't bleed into each other's outputs.
+    Sends requests with distinct questions and checks that answers are independent.
+    This would catch ContextVar leakage or shared state bugs.
+    """
+    questions = [f"Unique question {i} xyzzy" for i in range(10)]
+
+    async with httpx.AsyncClient() as client:
+        tasks = [
+            make_request(client, "AsyncPredict", q)
+            for q in questions
+        ]
+        responses = await asyncio.gather(*tasks)
+
+    for r in responses:
+        assert r.status_code == 200
+        data = r.json()
+        assert "answer" in data
+        # Mock server returns the same canned response, but we're verifying
+        # there's no exception or empty response caused by state mixing.
+        assert data["answer"] != ""
diff --git a/tests/load/assert_benchmark.py b/tests/load/assert_benchmark.py
new file mode 100644
index 0000000..8b8bf53
--- /dev/null
+++ b/tests/load/assert_benchmark.py
@@ -0,0 +1,55 @@
+"""
+Compares two locust CSV result files and fails if performance has regressed.
+Usage: python tests/load/assert_benchmark.py results/baseline_stats.csv results/current_stats.csv
+"""
+import csv
+import sys
+
+
+def load_stats(path):
+    with open(path) as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            if row["Name"] == "Aggregated":
+                return {
+                    "rps": float(row["Requests/s"]),
+                    "p95": float(row["95%"]),
+                    "failures": float(row["Failure Count"]) / max(float(row["Request Count"]), 1),
+                }
+    raise ValueError(f"No 'Aggregated' row found in {path}")
+
+
+def main():
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} <baseline_stats.csv> <current_stats.csv>")
+        sys.exit(2)
+
+    baseline = load_stats(sys.argv[1])
+    current = load_stats(sys.argv[2])
+
+    rps_change = (current["rps"] - baseline["rps"]) / baseline["rps"]
+    p95_change = (current["p95"] - baseline["p95"]) / baseline["p95"]
+
+    print(f"RPS: {baseline['rps']:.1f} -> {current['rps']:.1f} ({rps_change:+.1%})")
+    print(f"P95 (ms): {baseline['p95']:.0f} -> {current['p95']:.0f} ({p95_change:+.1%})")
+    print(f"Failures: {baseline['failures']:.1%} -> {current['failures']:.1%}")
+
+    errors = []
+    if rps_change < -0.10:
+        errors.append(f"RPS dropped {rps_change:.1%} (threshold: -10%)")
+    if p95_change > 0.20:
+        errors.append(f"P95 increased {p95_change:.1%} (threshold: +20%)")
+    if current["failures"] > 0.01:
+        errors.append(f"Failure rate {current['failures']:.1%} > 1%")
+
+    if errors:
+        print("\nREGRESSION DETECTED:")
+        for e in errors:
+            print(f"  x {e}")
+        sys.exit(1)
+
+    print("\nAll performance gates passed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/load/fixture_project/dspy.config.yaml b/tests/load/fixture_project/dspy.config.yaml
new file mode 100644
index 0000000..e8d1449
--- /dev/null
+++ b/tests/load/fixture_project/dspy.config.yaml
@@ -0,0 +1,24 @@
+app_id: load-test-app
+models:
+  default: model-alpha
+  registry:
+    model-alpha:
+      model: openai/mock-alpha
+      api_base: http://127.0.0.1:9999/v1
+      api_key: mock-key
+      model_type: chat
+      max_tokens: 100
+      temperature: 1.0
+      cache: false
+    model-beta:
+      model: openai/mock-beta
+      api_base: http://127.0.0.1:9999/v1
+      api_key: mock-key
+      model_type: chat
+      max_tokens: 100
+      temperature: 1.0
+      cache: false
+
+program_models:
+  SimplePredict: model-alpha
+  AsyncPredict: model-beta
diff --git a/tests/load/fixture_project/src/load_test_app/__init__.py b/tests/load/fixture_project/src/load_test_app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/load/fixture_project/src/load_test_app/modules/__init__.py b/tests/load/fixture_project/src/load_test_app/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/load/fixture_project/src/load_test_app/modules/async_predict.py b/tests/load/fixture_project/src/load_test_app/modules/async_predict.py
new file mode 100644
index 0000000..82afbad
--- /dev/null
+++ b/tests/load/fixture_project/src/load_test_app/modules/async_predict.py
@@ -0,0 +1,13 @@
+import dspy
+
+
+class AsyncPredict(dspy.Module):
+    """Same as SimplePredict but with aforward. Used to test async path."""
+    def __init__(self):
+        self.predict = dspy.Predict("question:str -> answer:str")
+
+    def forward(self, question: str) -> dspy.Prediction:
+        return self.predict(question=question)
+
+    async def aforward(self, question: str) -> dspy.Prediction:
+        return await self.predict.acall(question=question)
diff --git a/tests/load/fixture_project/src/load_test_app/modules/simple_predict.py b/tests/load/fixture_project/src/load_test_app/modules/simple_predict.py
new file mode 100644
index 0000000..7ed0d16
--- /dev/null
+++ b/tests/load/fixture_project/src/load_test_app/modules/simple_predict.py
@@ -0,0 +1,10 @@
+import dspy
+
+
+class SimplePredict(dspy.Module):
+    """Single-predict module. Used to test sync fallback path."""
+    def __init__(self):
+        self.predict = dspy.Predict("question:str -> answer:str")
+
+    def forward(self, question: str) -> dspy.Prediction:
+        return self.predict(question=question)
diff --git a/tests/load/locustfile.py b/tests/load/locustfile.py
new file mode 100644
index 0000000..eba9076
--- /dev/null
+++ b/tests/load/locustfile.py
@@ -0,0 +1,78 @@
+"""
+Locust load test for dspy-cli.
+
+Validates both throughput and correctness: each program must receive
+responses routed through its configured model. The mock LLM server
+echoes the model name back in the answer, so we can verify it.
+
+Single-scenario run:
+    locust -f tests/load/locustfile.py \
+        --host http://localhost:8000 \
+        --headless -u 100 -r 10 \
+        --run-time 60s \
+        --csv results/test
+
+Matrix run (preferred):
+    bash tests/load/run_matrix.sh
+"""
+import uuid
+from locust import HttpUser, task, between, events
+
+
+def unique_payload():
+    """Generate a unique question per request to defeat any caching layer."""
+    return {"question": f"What is the capital of France? [{uuid.uuid4().hex[:8]}]"}
+
+
+class SyncModuleUser(HttpUser):
+    """Hits the sync-fallback module (no aforward). Expects model-alpha."""
+    wait_time = between(0.01, 0.1)
+    weight = 1
+
+    @task
+    def call_simple_predict(self):
+        with self.client.post(
+            "/SimplePredict",
+            json=unique_payload(),
+            catch_response=True
+        ) as response:
+            if response.status_code != 200:
+                response.failure(f"Got {response.status_code}: {response.text[:200]}")
+                return
+            body = response.json()
+            answer = body.get("answer", "")
+            if "model=mock-alpha" not in answer:
+                response.failure(
+                    f"SimplePredict model mismatch: expected mock-alpha, got: {answer[:100]}"
+                )
+
+
+class AsyncModuleUser(HttpUser):
+    """Hits the native async module (has aforward). Expects model-beta."""
+    wait_time = between(0.01, 0.1)
+    weight = 1
+
+    @task
+    def call_async_predict(self):
+        with self.client.post(
+            "/AsyncPredict",
+            json=unique_payload(),
+            catch_response=True
+        ) as response:
+            if response.status_code != 200:
+                response.failure(f"Got {response.status_code}: {response.text[:200]}")
+                return
+            body = response.json()
+            answer = body.get("answer", "")
+            if "model=mock-beta" not in answer:
+                response.failure(
+                    f"AsyncPredict model mismatch: expected mock-beta, got: {answer[:100]}"
+                )
+
+
+@events.quitting.add_listener
+def on_quit(environment, **kwargs):
+    """Fail CI if error rate exceeds threshold."""
+    if environment.runner.stats.total.fail_ratio > 0.01:
+        print(f"ERROR: Failure rate {environment.runner.stats.total.fail_ratio:.1%} > 1%")
+        environment.process_exit_code = 1
diff --git a/tests/load/mock_lm_server.py b/tests/load/mock_lm_server.py
new file mode 100644
index 0000000..4f9282e
--- /dev/null
+++ b/tests/load/mock_lm_server.py
@@ -0,0 +1,83 @@
+"""
+Minimal OpenAI-compatible mock server for load testing.
+
+Echoes the requested model name back in the answer so load tests can
+verify that per-program model routing is correct under concurrency.
+
+Environment variables:
+    MOCK_PORT       - Port to listen on (default: 9999)
+    MOCK_DELAY_MS   - Simulated LLM latency in ms (default: 50)
+    MOCK_ERROR_RATE - Fraction of requests that return 500 (0.0-1.0, default: 0.0)
+"""
+import asyncio
+import os
+import random
+import time
+
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+
+app = FastAPI()
+
+MOCK_DELAY_MS = 50
+
+
+@app.post("/v1/chat/completions")
+async def chat(request: Request):
+    """Accept any JSON body — no strict schema validation.
+
+    LiteLLM sends varying extra fields (stream, n, tools, etc.)
+    depending on the call path. A strict Pydantic model rejects those.
+
+    The response embeds the requested model name in the answer field
+    so callers can verify the correct model was routed.
+    """
+    body = await request.json()
+    model = body.get("model", "unknown")
+    delay = float(os.environ.get("MOCK_DELAY_MS", MOCK_DELAY_MS)) / 1000
+    await asyncio.sleep(delay)
+
+    # Simulate LLM errors
+    error_rate = float(os.environ.get("MOCK_ERROR_RATE", "0.0"))
+    if error_rate > 0 and random.random() < error_rate:
+        return JSONResponse(
+            status_code=500,
+            content={
+                "error": {
+                    "message": "Mock LLM internal error",
+                    "type": "server_error",
+                    "code": "internal_error",
+                }
+            },
+        )
+
+    return {
+        "id": "mock-completion",
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": model,
+        "choices": [{
+            "index": 0,
+            "message": {
+                "role": "assistant",
+                "content": f'[[ ## answer ## ]]\nmodel={model}\n\n[[ ## completed ## ]]'
+            },
+            "finish_reason": "stop"
+        }],
+        "usage": {
+            "prompt_tokens": 20,
+            "completion_tokens": 10,
+            "total_tokens": 30
+        }
+    }
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+
+
+if __name__ == "__main__":
+    port = int(os.environ.get("MOCK_PORT", 9999))
+    uvicorn.run(app, host="127.0.0.1", port=port)
diff --git a/tests/load/results/.gitkeep b/tests/load/results/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/load/run_benchmark.sh b/tests/load/run_benchmark.sh
new file mode 100755
index 0000000..553d0c3
--- /dev/null
+++ b/tests/load/run_benchmark.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Single-scenario benchmark runner.
+# For multi-scenario matrix, use run_matrix.sh instead.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+RESULTS_DIR="$SCRIPT_DIR/results"
+MOCK_PORT=${MOCK_PORT:-9999}
+SERVER_PORT=${SERVER_PORT:-8000}
+USERS=${USERS:-100}
+SPAWN_RATE=${SPAWN_RATE:-10}
+DURATION=${DURATION:-60s}
+MOCK_DELAY_MS=${MOCK_DELAY_MS:-500}
+LABEL=${LABEL:-"$(git -C "$REPO_ROOT" rev-parse --short HEAD)"}
+
+mkdir -p "$RESULTS_DIR"
+
+kill_port() {
+    # Kill ALL processes listening on a port, including children
+    lsof -ti:"$1" 2>/dev/null | xargs kill -9 2>/dev/null || true
+}
+
+cleanup() {
+    echo "Cleaning up..."
+    kill_port $MOCK_PORT
+    kill_port $SERVER_PORT
+    sleep 1
+}
+trap cleanup EXIT
+
+# 0. Ensure ports are free before starting
+kill_port $MOCK_PORT
+kill_port $SERVER_PORT
+sleep 1
+
+# 1. Start mock LLM server
+echo "Starting mock LLM server on :$MOCK_PORT (delay=${MOCK_DELAY_MS}ms)..."
+MOCK_DELAY_MS=$MOCK_DELAY_MS MOCK_PORT=$MOCK_PORT python "$SCRIPT_DIR/mock_lm_server.py" &
+sleep 1
+
+if ! curl -sf http://127.0.0.1:$MOCK_PORT/health > /dev/null; then
+    echo "ERROR: Mock LLM server failed to start"
+    exit 1
+fi
+echo "Mock LLM server ready."
+
+# 2. Start dspy-cli server against fixture project
+echo "Starting dspy-cli server on :$SERVER_PORT..."
+pushd "$SCRIPT_DIR/fixture_project" > /dev/null
+dspy-cli serve --port $SERVER_PORT --no-reload --no-save-openapi --system &
+popd > /dev/null
+sleep 3
+
+# 3. Wait for server health
+echo "Waiting for server..."
+for i in {1..20}; do
+    if curl -sf http://127.0.0.1:$SERVER_PORT/programs > /dev/null; then
+        echo "Server ready."
+        break
+    fi
+    if [ $i -eq 20 ]; then
+        echo "ERROR: Server failed to start within 20s"
+        exit 1
+    fi
+    sleep 1
+done
+
+# 4. Run load test
+echo "Running load test (users=$USERS, delay=${MOCK_DELAY_MS}ms, duration=$DURATION)..."
+locust -f "$SCRIPT_DIR/locustfile.py" \
+    --host http://127.0.0.1:$SERVER_PORT \
+    --headless \
+    -u $USERS -r $SPAWN_RATE \
+    --run-time $DURATION \
+    --csv "$RESULTS_DIR/$LABEL" \
+    --html "$RESULTS_DIR/$LABEL.html"
+
+echo "Results written to $RESULTS_DIR/$LABEL*.csv"
+echo "Done."
diff --git a/tests/load/run_matrix.sh b/tests/load/run_matrix.sh
new file mode 100755
index 0000000..b7c4f21
--- /dev/null
+++ b/tests/load/run_matrix.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Matrix benchmark: runs multiple delay x user-count scenarios.
+#
+# Each scenario boots fresh servers to avoid cross-contamination.
+# Results go to tests/load/results/