diff --git a/.env.example b/.env.example
index 3ac171e..c6e7134 100644
--- a/.env.example
+++ b/.env.example
@@ -41,6 +41,10 @@ REDIS_WRITE_TIMEOUT=10
 # ─── Security ─────────────────────────────────────────────────────────────────
 # AES-256 key — must be exactly 32 characters
 ENCRYPTION_KEY=LKyGslR3InLES/EYQiJZcW06KFNMoevUd6kehjtrxPA=
+# Separate AES-256 key for encrypting webhook secrets at rest (also exactly 32
+# characters). Keep this DISTINCT from ENCRYPTION_KEY so leaking one key does
+# not compromise the other. Falls back to ENCRYPTION_KEY when unset.
+WEBHOOK_ENCRYPTION_KEY=Hh1pVq8sTn4mWz2bKx7dRf3yLc6gJe0a
 
 # ─── Storage ──────────────────────────────────────────────────────────────────
 # Set BUCKET_PROVIDER to "gcs" or "s3"
@@ -90,6 +94,23 @@ LOG_LEVEL=INFO
 AUTO_MIGRATE=false
 # Python worker: override default migrations directory
 # MIGRATIONS_DIR=/path/to/migrations
+# Required to apply migration versions 7 and 8 on first bootstrap of a fresh
+# database — they drop or alter existing user data (webhook_registrations,
+# assets.owner_id). Set to "true" ONLY when bootstrapping a clean database;
+# NEVER set this on a database that already contains data — apply the
+# migrations manually instead and review the SQL before running it.
+MIGRATION_ALLOW_DESTRUCTIVE=false
+
+# ─── Idempotency ──────────────────────────────────────────────────────────────
+# How long a stored Idempotency-Key + response is replayable (Go duration).
+IDEMPOTENCY_TTL=24h
+
+# ─── Quotas & rate limits (per tenant) ────────────────────────────────────────
+# Sustained per-tenant request rate (req/s) and token-bucket burst on presign.
+TENANT_RATE_LIMIT_RPS=10
+TENANT_RATE_LIMIT_BURST=20
+# Max assets a tenant may own (0 = unlimited). Over-quota presigns return 403.
+TENANT_ASSET_QUOTA=0
 
 # ─── Worker ───────────────────────────────────────────────────────────────────
 WORKER_ID=
diff --git a/README.md b/README.md
index 80a0a4e..d0ac973 100644
--- a/README.md
+++ b/README.md
@@ -10,13 +10,14 @@ A lightweight, scalable media processing pipeline built with Go and Python. MPip
 ## 🌟 Features
 
 - **RESTful API Server** - High-performance Go server built with Chi router
-- **Asynchronous Processing** - Redis Streams job queue for scalable media processing
+- **Concurrent Processing** - Redis Streams job queue with a **bounded worker pool** (`MAX_CONCURRENT_JOBS`) for parallel media processing — ~2.4× throughput vs single-threaded in load tests
+- **Resilient delivery** - **`XAUTOCLAIM`** consumer-group recovery reclaims messages from dead workers, and poison/over-retried messages are routed to a **dead-letter stream** (`media:jobs:dlq`) instead of being dropped
 - **Pluggable Storage** - GCS and S3/MinIO (any S3-compatible store) behind a single provider abstraction, selected by config
 - **Image Processing** - Automatic generation of optimized, content-addressed image variants (resize, re-encode, format conversion)
 - **Video Processing** - Poster generation, 720p transcode, and preview clips
 - **Database-Backed** - PostgreSQL as the durable source of truth for assets, variants, and jobs
-- **Webhooks** - Registration and delivery tracking tables for outbound event notifications
-- **Observability** - OpenTelemetry tracing + metrics on the API, Prometheus metrics on the worker, with a bundled Grafana/Tempo/Loki/Prometheus stack
+- **Webhooks** - Registration + **concurrent** signed delivery (`WEBHOOK_CONCURRENCY`) with HMAC signatures, exponential-backoff retries, and delivery tracking
+- **Observability** - OpenTelemetry tracing + metrics on the API, Prometheus metrics on the worker, with a bundled Grafana/Tempo/Loki/Prometheus stack and a host-run k6 load harness
 - **Docker & Kubernetes Ready** - Multi-stage images and manifests for containerized deployment
 
 ## 🏗️ Architecture
@@ -46,10 +47,15 @@ Two-service pipeline communicating over **Redis Streams** (`media:jobs`). Postgr
 2. Go server creates the asset + job and returns a presigned upload URL
 3. Client uploads the raw file directly to object storage
 4. Client marks the asset uploaded; the job is enqueued on the Redis stream
-5. Python worker consumes the job, processes media (resize, transcode, optimize)
+5. The Python worker consumes jobs **concurrently** (a bounded pool of `MAX_CONCURRENT_JOBS`), processing media (resize, transcode, optimize)
 6. Variants are written back to object storage (deduplicated by content hash)
 7. Database is updated with asset status and variant metadata
 
+**Resilience:** the worker uses Redis Streams consumer-group semantics — each
+message is acked only after its job succeeds, dead-consumer messages are reclaimed
+with `XAUTOCLAIM`, and poison/over-retried messages are moved to a dead-letter
+stream (`media:jobs:dlq`) for inspection/replay rather than being dropped.
+
 ## 📋 Prerequisites
 
 - **Go** 1.24 or higher
@@ -89,12 +95,15 @@ DB_PASSWORD=your_password
 DB_NAME=mpiper
 DB_SSL_MODE=false
 AUTO_MIGRATE=true            # run embedded SQL migrations on startup
+MIGRATION_ALLOW_DESTRUCTIVE=true   # required on first bootstrap — see warning below
 
 # Redis (transport for the job stream)
 REDIS_CONNECTION_STRING=redis://localhost:6379/0
 
 # Security (must be exactly 32 bytes)
 ENCRYPTION_KEY=change_me_to_a_32_byte_secret____
+# Separate 32-byte key for webhook secrets (falls back to ENCRYPTION_KEY if unset)
+WEBHOOK_ENCRYPTION_KEY=change_me_to_a_diff_32_byte_key_
 
 # Storage — pick a provider
 BUCKET_PROVIDER=gcs          # gcs | s3
@@ -117,14 +126,38 @@ S3_PUBLIC_ENDPOINT_URL=http://localhost:9000
 # Worker
 STREAM_NAME=media:jobs
 JOB_POLL_INTERVAL=1
-MAX_CONCURRENT_JOBS=5
+MAX_CONCURRENT_JOBS=5         # bounded worker-pool size; set ≈ CPU cores per worker
+RECOVERY_MIN_IDLE_MS=120000  # idle threshold before XAUTOCLAIM reclaims a stuck message
+STREAM_DLQ_NAME=media:jobs:dlq
+SHUTDOWN_DRAIN_TIMEOUT=30     # seconds to drain in-flight jobs on SIGTERM
+
+# Webhooks
+WEBHOOK_CONCURRENCY=10        # concurrent signed deliveries per dispatcher tick
+WEBHOOK_BATCH_SIZE=50
+WEBHOOK_POLL_INTERVAL=2s
+WEBHOOK_MAX_ATTEMPTS=5
 ```
 
+> **Tuning `MAX_CONCURRENT_JOBS`:** media work is partly CPU-bound (Pillow/ffmpeg),
+> so set it close to the worker's CPU-core count. Going much higher *oversubscribes*
+> the cores and reduces throughput — load tests showed `MAX_CONCURRENT_JOBS=8` on 4 cores
+> was slower than `MAX_CONCURRENT_JOBS=4`. Size worker memory to the pool, not the single-threaded baseline.
+
 > The worker reads the same `S3_*` variables as the Go server (falling back to `BUCKET_*`), so one `.env` drives both services.
 
 ### 3. Set Up the Database
 
-Migrations run automatically on startup when `AUTO_MIGRATE=true` — both the Go server and the Python worker apply the embedded SQL migrations. To apply them manually instead:
+Migrations run automatically on startup when `AUTO_MIGRATE=true` — both the Go server and the Python worker apply the embedded SQL migrations.
+
+> **Destructive migrations are gated.** Versions `000007_split_webhook_key` and
+> `000008_assets_owner_not_null` drop or alter existing user data
+> (`webhook_registrations`, `assets.owner_id`). Both runners refuse to apply
+> them unless `MIGRATION_ALLOW_DESTRUCTIVE=true` is set. Set it for local
+> bootstrap on a fresh database, but **never** set it on a database that
+> already contains production data — apply those migrations by hand and review
+> the SQL first.
+
+To apply all migrations manually instead of relying on `AUTO_MIGRATE`:
 
 ```bash
 createdb mpiper
@@ -163,22 +196,20 @@ python -m worker               # worker
 
 ### 6. Test the API
 
-All `/api/v1` routes require a Bearer token — an AES-256-GCM token carrying a
-user id, signed with `ENCRYPTION_KEY` (see [`pkg/utils/crypt.go`](pkg/utils/crypt.go)).
-Mint one for local testing:
+All `/api/v1` routes require a Bearer **API key** — a scoped, revocable key
+(`mp_<prefix>_<secret>`) stored SHA-256-hashed at rest (see
+[`pkg/utils/apikey.go`](pkg/utils/apikey.go)). Mint one for a tenant with the
+CLI (it prints the key **once**):
 
 ```bash
-TOKEN=$(python3 - <<'PY'
-import base64, os
-from cryptography.hazmat.primitives.ciphers.aead import AESGCM
-key = b"change_me_to_a_32_byte_secret____"   # your 32-byte ENCRYPTION_KEY
-nonce = os.urandom(12)
-ct = AESGCM(key).encrypt(nonce, b"demo-user", None)
-print(base64.urlsafe_b64encode(nonce + ct).rstrip(b"=").decode())
-PY
-)
+TOKEN="$(go run ./cmd/mint-api-key --tenant demo-user)"
+# optional: --expires 720h  --scopes assets:write,webhooks:write
 ```
 
+> The CLI connects to the database using your environment config (`.env.local`
+> in development). For the fully containerized demo, the bundled scripts seed a
+> key directly into the running Postgres — see **Run the demo** below.
+
 Request a presigned upload URL:
 
 ```bash
@@ -271,6 +302,34 @@ All `/api/v1` routes require an `Authorization: Bearer <token>` header (see
 > For MinIO it is `S3_PUBLIC_ENDPOINT_URL` (the client-facing endpoint), so the
 > URL is reachable from wherever the client runs — see [Storage Providers](#storage-providers).
 
+#### Idempotency
+
+`POST /storage/presign` (and the `complete` endpoint) accept an optional
+`Idempotency-Key` header so client retries don't create duplicate assets. The
+first request for a given key runs normally and its response is stored
+(per-tenant, 24h TTL by default — `IDEMPOTENCY_TTL`); a retry with the **same
+key and same body** replays the stored response verbatim (with
+`Idempotent-Replayed: true`). Reusing a key with a **different body** returns
+`422`, and a duplicate that arrives while the first is still in flight returns
+`409`.
+
+```bash
+curl -X POST http://localhost:5010/api/v1/storage/presign \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Idempotency-Key: 9f1c0b2a-..." \
+  -H "Content-Type: application/json" \
+  -d '{ "fileName": "image.jpg", "contentType": "image/jpeg", "size": 1024000 }'
+```
+
+#### Rate limits & quotas
+
+Presign is rate-limited **per tenant** (token bucket, `TENANT_RATE_LIMIT_RPS`
+sustained / `TENANT_RATE_LIMIT_BURST` burst); exceeding it returns `429` with a
+`Retry-After` header. An optional per-tenant asset quota
+(`TENANT_ASSET_QUOTA`, `0` = unlimited) returns `403` once a tenant is at its
+cap. Limits are isolated per tenant — one tenant hitting its limit does not
+affect another.
+
 ### Mark an asset complete (enqueue processing)
 
 **Endpoint:** `GET /api/v1/assets/{assetId}/complete`
@@ -299,8 +358,9 @@ Register an endpoint to receive processing-lifecycle events.
 
 Deliveries are signed: each POST carries an `X-Webhook-Signature: sha256=<hmac>`
 header computed over the JSON body using your registration `secret` (stored
-encrypted at rest). A background dispatcher delivers pending events with
-exponential-backoff retries and tracks them in the `webhook_deliveries` table.
+encrypted at rest). A background dispatcher delivers pending events **concurrently**
+(bounded by `WEBHOOK_CONCURRENCY`) with exponential-backoff retries and tracks them
+in the `webhook_deliveries` table.
 
 ```bash
 curl -X POST http://localhost:5010/api/v1/webhooks \
@@ -331,8 +391,8 @@ produce variants, fetches a variant back over HTTP, and asserts the
 `job.starting → job.started → job.done` webhooks were delivered. It prints a
 PASS/FAIL summary and exits non-zero on any failure.
 
-Requirements on the host: `bash`, `curl`, `jq`, `docker`, and a `python3` with
-the `cryptography` package (used only to mint the auth token).
+Requirements on the host: `bash`, `curl`, `jq`, `docker`, and a `python3`
+(stdlib only — used to mint an API key seeded into the containerized Postgres).
 
 ## 🔧 Development
 
@@ -359,7 +419,7 @@ mpiper/
 │   └── utils/
 │       └── storagex/    # Storage abstraction (GCS, S3/MinIO)
 ├── worker/
-│   ├── consumer/        # Redis Streams consumer + config
+│   ├── consumer/        # Redis Streams consumer (bounded pool, XAUTOCLAIM recovery, DLQ) + config
 │   ├── processing/      # Image/video processing
 │   ├── storage/         # Storage adapters (base ABC, GCS, S3) + factory
 │   └── utils/           # Worker utilities (metrics)
@@ -477,6 +537,11 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 - [x] Support for AWS S3 / MinIO storage
 - [x] Webhook delivery with HMAC signing + retry tracking
 - [x] Video transcoding with FFmpeg (poster, 720p, preview)
+- [x] Concurrent worker pool (`MAX_CONCURRENT_JOBS`) — ~2.4× throughput
+- [x] `XAUTOCLAIM` stream recovery + dead-letter stream for poison messages
+- [x] Concurrent webhook delivery (`WEBHOOK_CONCURRENCY`)
+- [x] End-to-end OpenTelemetry tracing, SLOs, Grafana dashboards + k6 load harness
+- [ ] Queue-depth autoscaling (KEDA) — *next*
 - [ ] Support for Azure Blob Storage
 - [ ] Admin dashboard
 - [ ] Batch processing API
diff --git a/cmd/mint-api-key/main.go b/cmd/mint-api-key/main.go
new file mode 100644
index 0000000..c3f0bae
--- /dev/null
+++ b/cmd/mint-api-key/main.go
@@ -0,0 +1,115 @@
+// Command mint-api-key inserts a new API key for a tenant and prints the
+// plaintext key exactly once. There is no HTTP admin surface — keys are minted
+// out-of-band with this tool.
+//
+// Usage:
+//
+//	go run ./cmd/mint-api-key --tenant demo-user [--env development]
+//	                          [--expires 720h] [--scopes assets:write,webhooks:write]
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/rndmcodeguy20/mpiper/internal/config"
+	"github.com/rndmcodeguy20/mpiper/internal/database"
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"github.com/rndmcodeguy20/mpiper/pkg/utils"
+	tenantpkg "github.com/rndmcodeguy20/mpiper/pkg/utils/tenant"
+	"go.uber.org/zap"
+)
+
+func main() {
+	var (
+		tenant  = flag.String("tenant", "", "tenant id the key authenticates as (required)")
+		env     = flag.String("env", envOr("ENV", "development"), "config environment (development|staging|production)")
+		expires = flag.Duration("expires", 0, "optional validity window, e.g. 720h; 0 means never expires")
+		scopes  = flag.String("scopes", "", "optional comma-separated scopes")
+	)
+	flag.Parse()
+
+	if *tenant == "" {
+		fmt.Fprintln(os.Stderr, "error: --tenant is required")
+		flag.Usage()
+		os.Exit(2)
+	}
+	if !tenantpkg.IsValidSlug(*tenant) {
+		fmt.Fprintf(os.Stderr, "error: --tenant %q is not a valid tenant identifier (allowed: a-z, 0-9, _, -; max 64 chars)\n", *tenant)
+		flag.Usage()
+		os.Exit(2)
+	}
+
+	cfg, err := config.InitializeConfig(config.ToEnvironment(*env))
+	if err != nil {
+		fatalf("load config: %v", err)
+	}
+	config.Init(cfg)
+
+	db, err := database.NewPostgresDB(cfg.DB)
+	if err != nil {
+		fatalf("connect db: %v", err)
+	}
+	defer func() { _ = db.Close() }()
+
+	mat, err := utils.GenerateAPIKey()
+	if err != nil {
+		fatalf("generate key: %v", err)
+	}
+
+	var scopeList []string
+	if s := strings.TrimSpace(*scopes); s != "" {
+		for _, sc := range strings.Split(s, ",") {
+			if t := strings.TrimSpace(sc); t != "" {
+				scopeList = append(scopeList, t)
+			}
+		}
+	}
+
+	var expiresAt *time.Time
+	if *expires > 0 {
+		t := time.Now().Add(*expires).UTC()
+		expiresAt = &t
+	}
+
+	repo := repository.NewAPIKeyRepository(db, zap.NewNop())
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	id, err := repo.Create(ctx, *tenant, mat.Hash, mat.Prefix, scopeList, expiresAt)
+	if err != nil {
+		fatalf("insert key: %v", err)
+	}
+
+	exp := "never"
+	if expiresAt != nil {
+		exp = expiresAt.Format(time.RFC3339)
+	}
+
+	// Human-readable summary to stderr; the bare key to stdout so it can be
+	// captured cleanly: KEY=$(go run ./cmd/mint-api-key --tenant t)
+	fmt.Fprintf(os.Stderr, "API key created\n")
+	fmt.Fprintf(os.Stderr, "  id:      %s\n", id)
+	fmt.Fprintf(os.Stderr, "  tenant:  %s\n", *tenant)
+	fmt.Fprintf(os.Stderr, "  prefix:  %s\n", mat.Prefix)
+	fmt.Fprintf(os.Stderr, "  scopes:  %v\n", scopeList)
+	fmt.Fprintf(os.Stderr, "  expires: %s\n", exp)
+	fmt.Fprintf(os.Stderr, "  (the key below is shown ONCE and is not recoverable)\n")
+	fmt.Println(mat.Full)
+}
+
+func fatalf(format string, args ...any) {
+	fmt.Fprintf(os.Stderr, "mint-api-key: "+format+"\n", args...)
+	os.Exit(1)
+}
+
+func envOr(key, def string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return def
+}
diff --git a/cmd/server/main.go b/cmd/server/main.go
index bb4f886..c9f3fc3 100644
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"database/sql"
 	"errors"
 	"fmt"
 	"net/http"
@@ -100,7 +101,7 @@ func main() {
 
 	if cfg.AutoMigrate {
 		baseLogger.Info("AUTO_MIGRATE=true: running migrations")
-		if err := database.RunMigrations(db.DB); err != nil {
+		if err := database.RunMigrations(db.DB, cfg.MigrationAllowDestructive); err != nil {
 			baseLogger.Sugar().Fatalf("Migration failed: %v", err)
 		}
 		baseLogger.Info("Migrations applied successfully")
@@ -125,6 +126,13 @@ func main() {
 	_ = m.RegisterOutboxPendingFunc(func(ctx context.Context) (int64, error) {
 		return outboxRepo.CountPending(ctx)
 	})
+
+	// Observe the database connection-pool stats (in-use / idle / open / max /
+	// wait count). sqlx.DB embeds *sql.DB, so db.Stats() exposes pool saturation
+	// — the key signal for whether the DB pool is a bottleneck under load.
+	_ = m.RegisterDBStatsFunc(func() sql.DBStats {
+		return db.Stats()
+	})
 	go relay.Start(serverCtx)
 	go relay.StartCleanup(serverCtx, cfg.Outbox.Retention)
 
@@ -134,9 +142,10 @@ func main() {
 		BatchSize:     cfg.Webhook.BatchSize,
 		Timeout:       cfg.Webhook.Timeout,
 		MaxAttempts:   cfg.Webhook.MaxAttempts,
-		EncryptionKey: cfg.EncryptionKey,
+		EncryptionKey: cfg.WebhookEncryptionKey,
 		Retention:     cfg.Webhook.Retention,
-	})
+		Concurrency:   cfg.Webhook.Concurrency,
+	}, m)
 	go webhookDispatcher.Start(serverCtx)
 	go webhookDispatcher.StartCleanup(serverCtx)
 
@@ -146,6 +155,31 @@ func main() {
 		return count, err
 	})
 
+	// --- Idempotency key TTL sweep ---
+	// Periodically purge expired idempotency keys so the table doesn't grow
+	// unbounded. Interval is a fraction of the TTL (min 1 minute).
+	idempotencyRepo := repository.NewIdempotencyRepository(db, baseLogger)
+	go func() {
+		interval := cfg.IdempotencyTTL / 24
+		if interval < time.Minute {
+			interval = time.Minute
+		}
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-serverCtx.Done():
+				return
+			case <-ticker.C:
+				if n, err := idempotencyRepo.DeleteExpired(serverCtx); err != nil {
+					baseLogger.Sugar().Errorf("idempotency sweep failed: %v", err)
+				} else if n > 0 {
+					baseLogger.Sugar().Infof("idempotency sweep: deleted %d expired keys", n)
+				}
+			}
+		}
+	}()
+
 	srv := server.NewServer(db, cfg, m)
 	go func() {
 		if err := srv.Start(); err != nil && !errors.Is(err, http.ErrServerClosed) {
diff --git a/deploy/k8s/secrets.yaml b/deploy/k8s/secrets.yaml
index 9997941..e1967a2 100644
--- a/deploy/k8s/secrets.yaml
+++ b/deploy/k8s/secrets.yaml
@@ -12,6 +12,12 @@ stringData:
   # Redis password (if required)
   REDIS_PASSWORD: ""
 
+  # AES-256 keys — each must be exactly 32 bytes. Keep WEBHOOK_ENCRYPTION_KEY
+  # DISTINCT from ENCRYPTION_KEY (auth vs webhook-secret encryption); it falls
+  # back to ENCRYPTION_KEY only when unset.
+  ENCRYPTION_KEY: "CHANGE_ME_TO_A_32_BYTE_SECRET___"
+  WEBHOOK_ENCRYPTION_KEY: "CHANGE_ME_TO_A_DIFF_32_BYTE_KEY_"
+
   # GCS Service Account JSON (base64 encode your actual JSON file)
   # Create with: kubectl create secret generic mpiper-secrets --from-file=gcs-credentials=./service-account.json -n mpiper
 ---
diff --git a/docker-compose.loadtest.yml b/docker-compose.loadtest.yml
new file mode 100644
index 0000000..499e1db
--- /dev/null
+++ b/docker-compose.loadtest.yml
@@ -0,0 +1,68 @@
+# ============================================================================
+# MPiper — Load-test overlay (Track 3, Phase 0)
+#
+#   docker compose \
+#     -f docker-compose.yml \
+#     -f docker-compose.observability.yml \
+#     -f docker-compose.loadtest.yml \
+#     up -d --build
+#
+# Why this exists
+# ---------------
+# Local load-test results are only interpretable when resources are PINNED, so
+# the bottleneck is a stable, observable fact rather than something that moves
+# run to run with spare laptop cores. This overlay caps CPU/memory on `api` and
+# `worker` and forces TRACE_SAMPLING_RATE=1.0 (every asset traced).
+#
+# Tunable knobs (env, with baseline defaults) — flip these for an A/B on the
+# SAME binary without editing this file or adding new overlays:
+#
+#   WORKER_CPUS           worker CPU limit            (default 1.0)
+#   WORKER_MEM            worker memory limit         (default 1024M)
+#   MAX_CONCURRENT_JOBS   worker pool size            (default 1 → serial baseline)
+#   WEBHOOK_CONCURRENCY   webhook delivery fan-out    (default 1 → serial baseline)
+#   JOB_POLL_INTERVAL     worker idle poll (s)        (default 1)
+#
+# Example A/B (concurrent worker + webhooks), same image, fixed core budget:
+#   BEFORE: WORKER_CPUS=4 MAX_CONCURRENT_JOBS=1 WEBHOOK_CONCURRENCY=1  docker compose … up -d --force-recreate worker api
+#   AFTER:  WORKER_CPUS=4 MAX_CONCURRENT_JOBS=8 WEBHOOK_CONCURRENCY=10 docker compose … up -d --force-recreate worker api
+#
+# Defaults reproduce the single-threaded Track-1 baseline. Record whatever knob
+# values you used alongside every experiment — they change the ceiling.
+# ============================================================================
+
+name: mpiper
+
+services:
+  api:
+    environment:
+      # Full sampling locally — never lose a trace to the 0.1 code default.
+      TRACE_SAMPLING_RATE: "1.0"
+      # Webhook delivery fan-out. Default 1 = serial baseline; raise for the A/B.
+      WEBHOOK_CONCURRENCY: "${WEBHOOK_CONCURRENCY:-1}"
+    deploy:
+      resources:
+        limits:
+          cpus: "1.0"
+          memory: 512M
+        reservations:
+          cpus: "0.25"
+          memory: 128M
+
+  worker:
+    environment:
+      # Full sampling locally; matches the API so the whole trace is captured.
+      TRACE_SAMPLING_RATE: "1.0"
+      # Worker pool size. Default 1 = single-threaded baseline; raise for the A/B.
+      MAX_CONCURRENT_JOBS: "${MAX_CONCURRENT_JOBS:-1}"
+      JOB_POLL_INTERVAL: "${JOB_POLL_INTERVAL:-1}"
+    deploy:
+      resources:
+        limits:
+          # Default 1 CPU surfaces the single-threaded baseline; set WORKER_CPUS
+          # higher (e.g. 4) to give the pool real cores for the concurrency A/B.
+          cpus: "${WORKER_CPUS:-1.0}"
+          memory: "${WORKER_MEM:-1024M}"
+        reservations:
+          cpus: "0.5"
+          memory: 256M
diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml
index 65d4760..7ff8300 100644
--- a/docker-compose.observability.yml
+++ b/docker-compose.observability.yml
@@ -21,7 +21,7 @@ services:
   # Grafana Tempo - Distributed Tracing Backend
   # ==========================================================================
   tempo:
-    image: grafana/tempo:latest
+    image: grafana/tempo:2.6.1
     container_name: mpiper-tempo
     command: ["-config.file=/etc/tempo.yaml"]
     volumes:
@@ -102,8 +102,14 @@ services:
       - '--web.console.libraries=/usr/share/prometheus/console_libraries'
       - '--web.console.templates=/usr/share/prometheus/consoles'
       - '--web.enable-lifecycle'
+      # Accept k6's Prometheus remote-write output (Track 3, Phase 4) so client-
+      # side load metrics land in the same Prometheus as the server-side ones.
+      - '--web.enable-remote-write-receiver'
+      # Exemplar storage powers the histogram-bucket -> Tempo trace links.
+      - '--enable-feature=exemplar-storage'
     volumes:
       - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./observability/prometheus.rules.yml:/etc/prometheus/prometheus.rules.yml
       - prometheus-data:/prometheus
     ports:
       - "9090:9090"
diff --git a/docker-compose.webhooks.yml b/docker-compose.webhooks.yml
index e9b5200..5b7d6eb 100644
--- a/docker-compose.webhooks.yml
+++ b/docker-compose.webhooks.yml
@@ -9,7 +9,9 @@ services:
     image: mendhak/http-https-echo:latest
     container_name: mpiper-webhook-receiver
     ports:
-      - "8888:8080"
+      # Host 8899 (not 8888 — that collides with the otel-collector). Internal
+      # delivery uses the docker-network name http://webhook-receiver:8080.
+      - "8899:8080"
     environment:
       HTTP_PORT: 8080
     networks:
diff --git a/docs/arch/abr-transcoding-pipeline.md b/docs/arch/abr-transcoding-pipeline.md
new file mode 100644
index 0000000..80fb1d2
--- /dev/null
+++ b/docs/arch/abr-transcoding-pipeline.md
@@ -0,0 +1,94 @@
+# ADR 0003: Adaptive Bitrate Transcoding Pipeline
+
+## Status
+Accepted
+
+## Context
+Current pipeline produces single 720p H.264 output (`worker/processing/videos.py:transcode_720p`). Need multi-bitrate HLS/DASH for adaptive streaming across devices and network conditions.
+
+## Decision
+
+### ABR Ladder (H.264 baseline → high profiles)
+| Variant | Resolution | Bitrate | Audio | Profile | Use Case |
+|---------|------------|---------|-------|---------|----------|
+| 1080p   | 1920×1080  | 5000k   | 192k  | high    | Desktop, TV |
+| 720p    | 1280×720   | 2800k   | 128k  | high    | Tablet, Mobile WiFi |
+| 480p    | 854×480    | 1400k   | 128k  | main    | Mobile 3G/4G |
+| 360p    | 640×360    | 800k    | 96k   | baseline| Mobile 2G, fallback |
+| 240p    | 426×240    | 400k    | 64k   | baseline| Lowest-bandwidth fallback |
+
+### Encoding Strategy
+- **Single-pass filter_complex** — one ffmpeg invocation encodes all rungs simultaneously via `split` + parallel `scale` + `encode` filtergraph
+- **Segment duration**: 6s (VOD), 2s (Live/LL-HLS)
+- **Segment format**: fMP4 (CMAF) for HLS + DASH shared segments
+- **Manifests**: HLS v7 (`#EXT-X-VERSION:7`) + DASH MPD (ISO BMFF)
+- **Deduplication**: Content-hash based (already implemented in `processor.py:check_for_duplicate`)
+
+### Hardware Acceleration
+| Platform | Encoder | Detection |
+|----------|---------|-----------|
+| NVIDIA GPU | h264_nvenc | `nvidia-smi` / `ffmpeg -encoders` |
+| Intel QSV | h264_qsv | `vainfo` / `ffmpeg -encoders` |
+| AMD VCN | h264_amf | `rocminfo` / `ffmpeg -encoders` |
+| Apple VT | h264_videotoolbox | `system_profiler` / `ffmpeg -encoders` |
+| Fallback | libx264 | Always available |
+
+Selection logic: prefer HW encoder → fallback to libx264 `preset=fast`/`crf=22`.
+
+### Output Structure (Object Storage)
+```
+media/{owner_id}/processed/{asset_id}/
+├── hls/
+│   ├── master.m3u8
+│   ├── 1080p/index.m3u8 + seg_*.m4s
+│   ├── 720p/index.m3u8 + seg_*.m4s
+│   ├── 480p/index.m3u8 + seg_*.m4s
+│   ├── 360p/index.m3u8 + seg_*.m4s
+│   └── 240p/index.m3u8 + seg_*.m4s
+└── dash/
+    ├── manifest.mpd
+    └── (shared fMP4 segments via BaseURL)
+```
+
+### Database Schema Extensions
+```sql
+-- Add to variants.video
+ALTER TABLE variants.video ADD COLUMN IF NOT EXISTS manifest_url TEXT;
+ALTER TABLE variants.video ADD COLUMN IF NOT EXISTS abr_ladder JSONB;  -- stores ladder config used
+ALTER TABLE variants.video ADD COLUMN IF NOT EXISTS segment_duration INT DEFAULT 6;
+ALTER TABLE variants.video ADD COLUMN IF NOT EXISTS codec_profile TEXT; -- baseline/main/high
+```
+
+## Consequences
+
+### Positive
+- 3-5x faster than sequential encodes (single decode, single filtergraph)
+- True ABR playback on all HLS/DASH clients
+- CMAF single-storage for both HLS and DASH
+- Content-hash deduplication works across ABR variants
+
+### Negative
+- Requires GPU instances for cost-effective HD+ encoding
+- Manifest generation adds eventual consistency (segments appear before manifest updates)
+- Higher storage: ~2.5× single 720p
+
+### Risks
+- ffmpeg filter_complex syntax is brittle; need integration tests per codec/hwaccel combo
+- HW encoder availability varies by cloud provider/instance type
+
+## Implementation Plan
+
+| Phase | Task | Files |
+|-------|------|-------|
+| P0 | Single-pass ABR filtergraph generator | `worker/processing/abr.py` |
+| P0 | HLS/DASH manifest writer (fMP4 segments) | `worker/processing/manifests.py` |
+| P0 | HW acceleration detection + fallback | `worker/processing/hwaccel.py` |
+| P0 | Integrate into `process_video_file` | `worker/processing/videos.py` |
+| P1 | Per-title encoding (convex hull analysis) | `worker/processing/per_title.py` |
+| P1 | Cost tracking per asset (CPU-sec, GB-egress) | `internal/metrics/cost.go` |
+| P2 | DRM integration (Widevine/FairPlay/PlayReady) | `worker/processing/drm.py` |
+| P2 | Thumbnail sprites + WebVTT for scrubbing | `worker/processing/sprites.py` |
+
+## Related ADRs
+- ADR 0001: Ingress Outbox & Idempotent Consumer (`ingress-outbox-and-idempotent-consumer.md`)
+- ADR 0002: Reliability & Correctness (`reliability-and-correctness.md`)
\ No newline at end of file
diff --git a/docs/arch/image-pipeline-advancements.md b/docs/arch/image-pipeline-advancements.md
new file mode 100644
index 0000000..cb16a54
--- /dev/null
+++ b/docs/arch/image-pipeline-advancements.md
@@ -0,0 +1,178 @@
+# ADR 0004: Image Pipeline Advancements
+
+## Status
+Proposed
+
+## Context
+Current pipeline (`worker/processing/images.py`) generates 3 fixed WebP variants (thumbnail 256², display_small 512w, display_large 1280w). This wastes bandwidth on simple images, upscales small images, and lacks modern formats (AVIF, JPEG XL).
+
+## Decision
+
+### Phase P0 — Immediate Wins (2-3 days)
+
+#### 1. Modern Format Stack
+Encode each variant as AVIF → JPEG XL → WebP → JPEG (progressive), serve via `Accept` header negotiation.
+
+| Format | MIME | Pillow/lib | Quality/Speed | Browser Support |
+|--------|------|------------|---------------|-----------------|
+| AVIF   | image/avif | pillow-avif-plugin / libavif | q=50, speed=6 | Chrome 85+, Firefox 93+, Safari 16.4+ |
+| JPEG XL| image/jxl | libjxl / pillow-jxl | q=50, effort=7 | Safari 17+; Chrome 91–109 behind a flag (removed in 110); Firefox Nightly only |
+| WebP   | image/webp | Pillow (built-in) | q=75, method=6 | Universal |
+| JPEG   | image/jpeg | Pillow (built-in) | q=82, progressive, optimize | Universal |
+
+#### 2. Adaptive Variant Ladder (Per-Image)
+Replace fixed widths with content-aware targets:
+
+```python
+def compute_adaptive_variants(src_width: int, src_height: int, mime: str) -> list[dict]:
+    aspect = src_width / src_height
+    mpx = (src_width * src_height) / 1_000_000
+    
+    # Target megapixels per role (no upscale > 1.2x)
+    targets = [0.07, 0.3, 1.0, 3.0]  # thumb, small, medium, large
+    variants = []
+    
+    for i, tgt in enumerate(targets):
+        if tgt > mpx * 1.2:
+            break
+        w = int((tgt * 1_000_000 * aspect) ** 0.5)
+        h = int(w / aspect)
+        variants.append({
+            "role": ["thumbnail", "small", "medium", "large"][i],
+            "width": w, "height": h,
+            "format": "avif",  # primary; others generated as fallbacks
+            "quality": 55 + i * 5,
+        })
+    return variants
+```
+
+#### 3. Parallel Variant Generation
+```python
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def process_image_file_parallel(asset_id, owner_id, path, content_hash, pg_pool, storage, cfg):
+    with Image.open(path) as img:
+        variants = compute_adaptive_variants(img.width, img.height, mime)
+        
+        with ThreadPoolExecutor(max_workers=cfg.image_workers) as pool:
+            futures = {
+                pool.submit(encode_and_upload, img, v, asset_id, owner_id, storage, pg_pool): v
+                for v in variants
+            }
+            for fut in as_completed(futures):
+                fut.result()  # propagate exceptions
+```
+
+### Phase P1 — Smart Operations (1-2 weeks)
+
+#### 4. Smart Crop (Saliency / Focal Point)
+- **Option A**: OpenCV spectral residual saliency (fast, no model)
+- **Option B**: Client-provided focal point `{x: 0.5, y: 0.3}` in upload metadata
+- Apply to `thumbnail` and `small` variants (center-crop → content-aware crop)
+
+#### 5. Animated Image Support
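+- Detect GIF/WebP/APNG animation via Pillow's per-image `is_animated` attribute (read it as `getattr(img, "is_animated", False)` — not every format plugin defines it)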
+- Re-encode as animated WebP (method=6, lossless=false) → 70-80% size reduction vs GIF
+- Extract first frame as static poster (WebP/AVIF)
+- Store frame count, loop count, duration in `variants.image` metadata
+
+#### 6. Color Space Normalization
+- `ImageOps.exif_transpose()` for orientation
+- Convert to sRGB ICC profile; preserve Display P3 if source has it
+- Optional: soft-proof for sRGB gamut mapping
+
+### Phase P2 — Frontend Integration & Quality Assurance (1 week)
+
+#### 7. Responsive HTML Generator
+```python
+def generate_picture_html(variants: list[dict], base_url: str, alt: str = "") -> str:
+    """Generate <picture> with format fallbacks + srcset."""
+    by_role = {}
+    for v in variants:
+        by_role.setdefault(v["role"], []).append(v)
+    
+    sources = []
+    for role, fmts in by_role.items():
+        for fmt in fmts:
+            sources.append(
+                f'<source type="image/{fmt["format"]}" '
+                f'srcset="{base_url}/{role}.{fmt["format"]}" '
+                f'media="(min-width: {fmt["width"]}px)">'
+            )
+    sources.append(f'<img src="{base_url}/medium.webp" alt="{alt}" loading="lazy">')
+    return "<picture>\n" + "\n".join(sources) + "\n</picture>"
+```
+
+#### 8. Perceptual Quality Metrics
+- Integrate SSIMULACRA2 (Cloudinary) or Butteraugli (Google) for automated QA
+- Fail build if variant quality drops below threshold
+- Store metric scores in `variants.image.quality_score`
+
+## Database Schema Extensions
+
+```sql
+ALTER TABLE variants.image ADD COLUMN IF NOT EXISTS format_set TEXT[];  -- ['avif','webp','jpeg']
+ALTER TABLE variants.image ADD COLUMN IF NOT EXISTS quality_score REAL; -- SSIMULACRA2 score
+ALTER TABLE variants.image ADD COLUMN IF NOT EXISTS is_animated BOOLEAN DEFAULT FALSE;
+ALTER TABLE variants.image ADD COLUMN IF NOT EXISTS frame_count INT;
+ALTER TABLE variants.image ADD COLUMN IF NOT EXISTS focal_point_x REAL;
+ALTER TABLE variants.image ADD COLUMN IF NOT EXISTS focal_point_y REAL;
+```
+
+## Storage Key Structure (Backward Compatible)
+
+```
+media/{owner_id}/processed/{asset_id}/
+├── thumbnail.avif
+├── thumbnail.webp
+├── thumbnail.jpg
+├── small.avif
+├── small.webp
+├── small.jpg
+├── medium.avif
+├── medium.webp
+├── medium.jpg
+├── large.avif
+├── large.webp
+├── large.jpg
+├── poster.avif          -- for animated sources
+├── poster.webp
+└── animation.webp       -- animated WebP (replaces GIF)
+```
+
+## Consequences
+
+### Positive
+- 30-50% bandwidth reduction (AVIF vs WebP)
+- No wasted upscales (adaptive ladder)
+- 3-4x faster processing (parallel Pillow)
+- Animated GIF → WebP: 70-80% size reduction
+- Drop-in `<picture>` HTML for frontend
+
+### Negative
+- AVIF encoding slower (~3x WebP); mitigate with `speed=6` + parallel workers
+- JPEG XL browser support is still limited (on by default only in Safari 17+; Chromium removed its flag in 110)
+- More storage variants (up to 4 formats × 4 roles = 16 files when JPEG XL is included, vs 3 today)
+- OpenCV dependency for saliency (optional)
+
+### Risks
+- Pillow AVIF plugin requires `libavif` system package
+- ThreadPoolExecutor GIL contention on CPU-heavy Pillow ops (mitigated: C extensions release GIL)
+- Accept header parsing complexity at CDN edge
+
+## Implementation Plan
+
+| Phase | Task | Files | Effort |
+|-------|------|-------|--------|
+| P0 | AVIF/WebP/JXL encoder + format negotiation | `worker/processing/formats.py` | 1 day |
+| P0 | Adaptive variant ladder | `worker/processing/adaptive_images.py` | 1 day |
+| P0 | Parallel processing + integrate | `worker/processing/images.py` | 0.5 day |
+| P1 | Smart crop (saliency/focal) | `worker/processing/smart_crop.py` | 2 days |
+| P1 | Animated image support | `worker/processing/animated.py` | 1 day |
+| P1 | Color space normalization | `worker/processing/color.py` | 1 day |
+| P2 | `<picture>` HTML generator | `worker/processing/responsive.py` | 0.5 day |
+| P2 | SSIMULACRA2 quality gate | `worker/processing/quality.py` | 2 days |
+
+## Related ADRs
+- ADR 0003: ABR Transcoding Pipeline (`abr-transcoding-pipeline.md`)
+- ADR 0001: Ingress Outbox & Idempotent Consumer (`ingress-outbox-and-idempotent-consumer.md`)
\ No newline at end of file
diff --git a/docs/enhancements/README.md b/docs/enhancements/README.md
new file mode 100644
index 0000000..015a1c8
--- /dev/null
+++ b/docs/enhancements/README.md
@@ -0,0 +1,219 @@
+# MPiper Enhancements — Roadmap
+
+This directory tracks the work that takes MPiper from a well-built side project
+to a production-grade media platform. Each **track** is chosen to teach a
+distinct, transferable systems-engineering concept *and* to add real product
+value — not feature-padding.
+
+The philosophy: **write a design doc per track before coding** (problem, options,
+decision, tradeoffs, how success is measured), and pair every track with a
+load test or chaos experiment so each claim ("now it scales", "now it's
+exactly-once") is *demonstrated*, not assumed.
+
+> **Progress:** Track 3 (observability + load testing) ✅, **Track 1 (concurrent
+> worker + XAUTOCLAIM recovery + DLQ) ✅**, and **Track 1b (webhook delivery
+> throughput) ✅** are done. Track 3 shipped the foundation that makes everything
+> measurable — tracing, SLOs, dashboards, a k6 harness. Track 1 then turned the #1
+> bottleneck into a **measured 2.37× worker throughput win** (0.73 → 1.73 jobs/s,
+> 1 → ~3.2 cores; see [`experiments/0002`](../../experiments/0002-concurrent-worker.md)),
+> and Track 1b wired the webhook delivery metrics + concurrent fan-out (see
+> [`experiments/0003`](../../experiments/0003-webhook-throughput.md)). **Next: Track 2
+> (autoscaling)** — now unblocked, since there is finally a concurrent worker to scale.
+
+## Where we are today
+
+A clean, correct, **single-tenant, best-effort, single-node-throughput** pipeline
+with good bones — now fully observable:
+
+- Transactional enqueue via an **outbox relay** (Postgres → Redis Streams).
+- An **idempotent-ish consumer** with content-hash dedup.
+- **Presigned uploads** with a split internal/public storage endpoint.
+- **Webhooks** with HMAC signing + exponential backoff.
+- **End-to-end distributed tracing** (API → outbox → Redis → worker → ffmpeg, one
+  waterfall per asset), **OTel metrics** on API and worker, **SLO recording rules**,
+  provisioned **Grafana dashboards**, and a host-run **k6 load harness** — on the
+  bundled Grafana/Tempo/Loki/Prometheus stack.
+
+Known seams where "side project" becomes "system" (verified in code, and now
+several of them **measured** under load):
+
+- **~~Single-threaded worker~~ ✅ RESOLVED (Track 1)** — now a bounded
+  `ThreadPoolExecutor` honouring `MAX_CONCURRENT_JOBS`. *Measured:* μ rose from
+  0.73 → **1.73 jobs/s (2.37×)** and worker CPU from 1 → ~3.2 cores at `mcj=4`.
+  Lesson banked: tune `MAX_CONCURRENT_JOBS ≈ cores` — `mcj=8` on 4 cores
+  *oversubscribed* and was slower.
+- **~~Webhook dispatcher can't keep up~~ ✅ RESOLVED (Track 1b)** — concurrent
+  `errgroup` fan-out (`WEBHOOK_CONCURRENCY`) + tuned HTTP transport, and the
+  previously-unrecorded `webhook_delivery_*` metrics are now wired. *Note:* at
+  local scale (fast receiver, CPU-pinned API) the dispatcher kept up even
+  serially, so the win here is observability + headroom; see
+  [`experiments/0003`](../../experiments/0003-webhook-throughput.md).
+- **~~Homegrown recovery~~ ✅ RESOLVED (Track 1)** — replaced the DB-scan + re-`XADD`
+  with `XAUTOCLAIM` consumer-group recovery, and added a **dead-letter stream**
+  (`media:jobs:dlq`) with failure metadata + a depth gauge for poison/over-retried
+  messages (previously dropped/unacked-forever).
+- **No raw-upload lifecycle** — objects in `media/raw/` are never deleted.
+  *Measured:* ~**50%** of presigned uploads are never completed → orphaned objects
+  accumulate.
+- **Homegrown auth** — an AES-GCM token with no expiry/rotation, and the same
+  `ENCRYPTION_KEY` encrypts both auth tokens and webhook secrets.
+- **Polled high-churn tables** (`jobs`, `event_outbox`, `webhook_deliveries`), grown
+  unbounded with cleanup-by-retention only. *Measured:* `event_outbox` kept up with
+  **0 backlog** and the DB had headroom (**18 ms** mean query, **5/25** connections);
+  only `webhook_deliveries` actually strained.
+
+## What the first load test proved (exp 0001)
+
+Track 3 gave us the instrumentation to stop guessing. The first saturating run
+(`open --rate 10/s`, CPU-pinned worker) turned the seams above into a **measured,
+ranked** list — and every track below now has a baseline to beat by re-running the
+*same* k6 profile and comparing the dashboards.
+
+| Finding (measured) | What it means | Owner |
+|---|---|---|
+| Worker μ ≈ **1.1 jobs/s**, CPU 98%, queue → **2,544** | Single-threaded worker is the throughput ceiling | **Track 1 (P0)** |
+| `webhook_pending` → **5,901**, never drains | Dispatcher delivery rate ≪ insertion rate | **Track 1b (P1, new)** |
+| `event_outbox` **0 backlog**; DB **18 ms** mean, **5/25** conns | Outbox + DB have large headroom *today* | Track 7 → **defer** |
+| `webhook_deliveries` is the one polled table straining | The real, current trigger for data-layer work | Track 7 → **rescope to this** |
+| **~50%** of presigns never completed → orphaned `media/raw/` | Storage grows with abandoned uploads | Track 5 (small) |
+| `/complete` p99 **358 ms** (synchronous MinIO HEAD) | Minor hot-path tail | Track 5 (small) |
+
+Net effect: **Track 1 is confirmed P0**, a **webhook-throughput bottleneck was
+surfaced that no track owned** (now Track 1b), and **Track 7's table-partitioning is
+premature** — the DB isn't the problem yet; the webhook *delivery loop* is.
+
+## Tracks
+
+| # | Track | Core systems lesson | Status |
+|---|-------|---------------------|--------|
+| 1 | [Concurrent worker + proper stream recovery + DLQ](track-01-concurrent-worker.md) | Concurrency models, at-least-once recovery, poison-message handling, head-of-line blocking | **done ✅ (2.37× — exp 0002)** |
+| 1b | Webhook delivery throughput *(surfaced by exp 0001)* | Concurrent I/O-bound delivery, backpressure on a side-channel, decoupling fan-out from job completion | **done ✅ (exp 0003)** |
+| 2 | [Queue-depth autoscaling](track-02-handoff.md) | Backpressure, control loops, Little's Law, SLO-driven capacity | **ready — needs k8s (deferred until a cluster is available)** |
+| 3 | [End-to-end tracing, SLOs & local load testing](track-03-observability-and-load.md) | Context propagation across async boundaries, the three pillars, SLO/SLI/error budgets, load-test methodology | **done ✅** |
+| 4 | [Multi-tenancy, auth & quotas](track-04-handoff.md) | AuthN vs AuthZ, key rotation, the idempotency pattern, tenant isolation | planned |
+| 5 | [Production ingestion pipeline](track-05-ingestion.md) | Resumable/multipart uploads, pipeline stages, defense-in-depth, trust boundaries | planned |
+| 6 | [Adaptive streaming + CDN](track-06-adaptive-streaming.md) | ABR streaming, CDN cache/invalidation, edge auth, encoding cost/quality tradeoffs | planned |
+| 7 | [Data layer at scale](track-07-data-layer.md) | Table partitioning, CDC vs polling, index design under write load | **deferred — rescope to `webhook_deliveries`** |
+| 8 | [Resilience & correctness verification](track-08-resilience.md) | Failure-mode analysis, exactly-once in practice, replay attacks, chaos engineering | planned |
+
+> Track 3 is the only track with a full design doc checked in, because it was built
+> first. Now that it's done, every track below is **measurable**: implement, re-run
+> the same k6 profile, compare dashboards, and record an `experiments/NNNN-*.md`
+> writeup. "It scales" is a claim we can prove, not assert.
+
+## Recommended sequence (re-prioritized from exp 0001 data)
+
+1. **~~Track 1 — concurrent worker + DLQ + stream recovery.~~ ✅ DONE.**
+   Was the P0 throughput ceiling (μ ≈ 1.1 jobs/s). Now a bounded pool: **measured
+   2.37×** (0.73 → 1.73 jobs/s), multi-core, 100% success, live DLQ. See
+   `experiments/0002`. Lesson: set `MAX_CONCURRENT_JOBS ≈ cores`.
+2. **~~Track 1b — webhook delivery throughput.~~ ✅ DONE.** Concurrent fan-out +
+   wired `webhook_delivery_*` metrics + transport tuning. Not the bottleneck at
+   local scale (kept up serially), so the win is observability + headroom; see
+   `experiments/0003`. To prove it under stress, re-run with a latency-bearing
+   receiver.
+3. **Track 2 — autoscaling. ← NEXT.** Now unblocked: there is finally a concurrent
+   worker to scale. Drive worker replica count off the queue-lag signal we already
+   expose (KEDA; k8s manifests exist). *Verify:* a backlog → scale-up → drain cycle.
+   Carry the Track 1 lesson forward — each replica runs its own pool, so set
+   `MAX_CONCURRENT_JOBS ≈ cores-per-pod` and scale *pods*, not threads. See
+   [`track-02-handoff.md`](track-02-handoff.md).
+4. **Track 4 — multi-tenancy + idempotency + auth.** The leap to "real users".
+5. **Track 6 — adaptive streaming + CDN.** The headline product feature.
+6. **Track 5 — ingestion.** Includes the small wins exp 0001 surfaced: abandoned-upload
+   lifecycle (~50% orphaned `media/raw/`) and the `/complete` MinIO-HEAD tail.
+7. **Track 7 — data layer.** **Deferred and rescoped.** DB/outbox have headroom today;
+   revisit when volume justifies, scoped first to `webhook_deliveries` churn (the one
+   polled table that actually strained) rather than blanket partitioning.
+8. **Track 8 — resilience & correctness.** Depth once the throughput tracks land.
+
+> **~~Track 3 follow-ups~~ ✅ DONE** (folded into Track 1b): `webhook_delivery_*`
+> metrics wired, `db.query.duration` fine-bucket view added, `storage_operation_*`
+> confirmed already recorded. Histogram-bucket standardization remains a watch-item
+> when reading p95 across a deploy window.
+
+---
+
+## Track catalog (summaries)
+
+### Track 1 — Concurrent worker + proper stream recovery + DLQ
+**Gap:** one job at a time; a 3s video blocks a 200ms thumbnail. Recovery scans
+Postgres and re-`XADD`s instead of using consumer-group delivery state.
+**Move:** bounded worker pool (process pool for CPU-bound ffmpeg/Pillow vs async
+for I/O — *choosing which is the lesson*); honour `MAX_CONCURRENT_JOBS` as a
+semaphore; `XAUTOCLAIM`/`XPENDING` to reclaim dead-consumer messages; a
+**dead-letter stream** for messages past the attempt cap; priority lanes so small
+jobs don't queue behind large transcodes.
+**Teaches:** thread vs process vs async, the GIL, CPU vs I/O bound, at-least-once
+recovery, poison-message handling, head-of-line blocking.
+
+### Track 1b — Webhook delivery throughput *(surfaced by exp 0001)*
+**Gap:** the dispatcher polls every 2s, batch 50, and delivers webhooks with
+*synchronous* HTTP + retries on a single loop. Each job emits 3 events
+(`job.starting/started/done`), so insertion rate ≫ delivery rate — the load test
+drove `webhook_pending` to ~5,900 with no recovery. Delivery is also under-
+instrumented: `webhook_delivery_total/duration/failures` are defined but never
+recorded, so only the `pending` gauge revealed the backlog.
+**Move:** a bounded pool of concurrent delivery workers (I/O-bound → async/threads
+fits); decouple fan-out from job completion; wire the delivery metrics + a
+delivery-latency SLI. Optionally move webhook rows onto their own stream consumer
+rather than a DB poll.
+**Teaches:** concurrency for I/O-bound work, backpressure on a side-channel,
+decoupling producers from slow consumers, instrumenting before optimizing.
+
+### Track 2 — Queue-depth autoscaling
+**Gap:** static worker count; bursts grow latency unbounded, idle wastes capacity.
+**Move:** expose stream lag + oldest-message-age (extend the existing relay-lag
+metric); drive **KEDA** (k8s manifests already exist) to scale workers on lag;
+load-test the backlog → scale → drain cycle.
+**Teaches:** backpressure, control loops, latency- vs queue-depth-based scaling,
+Little's Law (L = λW), capacity planning.
+
+### Track 4 — Multi-tenancy, auth & quotas
+**Gap:** homegrown AES token (no expiry/rotation, shared key with webhook secrets);
+single bucket, path-prefixed; no idempotency keys (retried presign = duplicate asset).
+**Move:** OIDC/JWT (asymmetric keys, expiry, JWKS rotation) or scoped API keys;
+separate webhook-signing secret; org→project→asset model with repository-layer
+row scoping and per-tenant storage prefixes/credentials; **idempotency keys** on
+`presign`/`complete`; per-tenant **quotas + rate limits** with usage accounting.
+**Teaches:** authN vs authZ, key management/rotation, the idempotency pattern,
+tenant isolation, security blast-radius.
+
+### Track 5 — Production ingestion pipeline
+**Gap:** single presigned `PUT`, 500MB cap, no resumability, MIME-only validation,
+no scanning. Plus (from exp 0001) **no lifecycle for abandoned uploads** — ~50% of
+presigns never complete, orphaning `media/raw/` objects.
+**Move:** S3 **multipart/resumable** uploads with part-level retry; a validation
+stage with real content sniffing (`python-magic` is already a dep); optional
+**ClamAV** malware scan as a stage; dedup *before* full download via verified
+client-supplied hash; a TTL/lifecycle sweep for un-completed raw uploads.
+**Teaches:** large-file transfer, pipeline/stage design, defense-in-depth, trust
+boundaries (never trust client content-type).
+
+### Track 6 — Adaptive streaming + CDN
+**Gap:** one fixed 720p MP4 at hardcoded 2500kbps, served straight from MinIO.
+**Move:** generate an **HLS/DASH adaptive ladder** (multiple renditions + manifest —
+`variants.video.manifest_url` already exists in the schema); serve via **CDN** with
+signed URLs + cache-control; content-aware/per-title encoding decisions.
+**Teaches:** adaptive bitrate streaming, CDN cache strategy + invalidation, edge
+signed-URL access control, encoding cost/quality tradeoffs.
+
+### Track 7 — Data layer at scale *(deferred — see exp 0001)*
+**Gap:** `jobs`, `event_outbox`, `webhook_deliveries` polled and growing. The load
+test showed the DB and outbox have **headroom today** (18 ms mean query, 5/25
+connections, 0 outbox backlog), so blanket partitioning is premature — but
+`webhook_deliveries` is the one table that genuinely strained.
+**Move:** start narrow — partition/clean `webhook_deliveries` and replace its poll
+with `LISTEN/NOTIFY` or a stream consumer (overlaps Track 1b). Broaden to the other
+tables (monthly partitions; drop instead of DELETE; read replicas; CDC) only when
+volume justifies it.
+**Teaches:** partitioning, CDC vs polling, write-heavy index design, pool sizing.
+
+### Track 8 — Resilience & correctness verification
+**Gap:** unit + integration tests exist, but no proof of survival under failure/load.
+**Move:** **fault injection / chaos** (kill the worker mid-transcode, pause Redis,
+fill the disk — verify processed-once holds); **load tests** with latency budgets in
+CI; **webhook contract tests** + replay protection (sign a timestamp, reject stale
+deliveries — today a captured payload replays forever).
+**Teaches:** failure-mode analysis, exactly-once vs at-least-once in practice,
+replay attacks, reliability as a tested property.
diff --git a/docs/enhancements/track-01-handoff.md b/docs/enhancements/track-01-handoff.md
new file mode 100644
index 0000000..7b51d4e
--- /dev/null
+++ b/docs/enhancements/track-01-handoff.md
@@ -0,0 +1,330 @@
+# Track 1 + 1b — Session Handoff (start here)
+
+**Purpose:** everything a fresh conversation needs to begin **Track 1 (concurrent
+worker + stream recovery + DLQ)** and **Track 1b (webhook delivery throughput)**
+without prior context. Read this top to bottom, then open
+`track-01-concurrent-worker.md` for the full design. This doc is the *operational*
+companion: where things are, how to run them, what the baseline is, and the
+landmines already discovered. It assumes **Track 3 is done** — tracing, SLOs,
+dashboards, and the k6 harness all exist, so every change here is measurable.
+
+---
+
+## 1. What MPiper is (60-second orientation)
+
+A media-processing pipeline: a **Go API** (`cmd/server`, `internal/`) accepts
+uploads and a **Python worker** (`worker/`) processes them. They communicate over
+**Redis Streams** (`media:jobs`, group `worker-group`). **Postgres** is the
+durable source of truth; **MinIO** (S3-compatible) stores objects. Webhooks notify
+clients of job lifecycle events.
+
+**Asset flow:**
+`POST /api/v1/storage/presign` → client `PUT`s file to MinIO →
+`GET /api/v1/assets/{id}/complete` (writes asset `uploaded` + job + outbox row +
+`job.starting` webhook rows in one tx) → **outbox relay** (1s poll) publishes to
+Redis → **worker** consumes → image (3 webp variants) or video (poster + 720p +
+preview) → variants written to MinIO + Postgres, asset `ready` → worker inserts
+`job.started`/`job.done` webhook rows → **webhook dispatcher** (2s poll) delivers
+signed POSTs.
+
+---
+
+## 2. The goals in one sentence each
+
+- **Track 1:** make the worker process **N jobs concurrently** (it does 1 today),
+  recover dead-consumer messages with **Redis Streams' own `XPENDING`/`XAUTOCLAIM`**
+  instead of a DB scan, and route poison messages to a **dead-letter stream** — so
+  the worker's service rate scales with cores and a single bad/large job can't stall
+  the pipeline.
+- **Track 1b:** make the **webhook dispatcher deliver concurrently** (it delivers
+  serially today) and **wire its delivery metrics**, so `webhook_pending` drains
+  instead of growing unboundedly.
+
+Both are throughput fixes for the two bottlenecks the Track 3 load test proved.
+
+---
+
+## 3. The baseline to beat (exp 0001, verified)
+
+From `experiments/0001-worker-saturation.md` (open model, `--rate 10/s`, worker
+pinned to 1 CPU). Re-run the **same** profile after each track and compare.
+
+| Signal | Baseline | Target after the track |
+|---|---|---|
+| Worker service rate μ | **~1.1 jobs/s** | scales ~N× with the pool (until CPU-bound) |
+| Worker CPU | **98%** (1 core, pegged) | utilizes all allotted cores |
+| Peak queue depth | **2,544 and growing** | stabilizes (drains at λ ≤ μ) |
+| Asset proc p50 / mean | **0.86 s / 1.76 s** | unchanged per-job; throughput is the win |
+| `webhook_pending` peak | **~5,901, never drains** | drains to ~0 (Track 1b) |
+| Job success rate | 100% | stays 100% (no double-processing) |
+| API presign p95 / complete p99 | 48 ms / 358 ms | unaffected (API isn't the bottleneck) |
+| DB | 18 ms mean, 5/25 conns, 0 waits | watch pool as worker concurrency rises |
+
+> **Watch the DB pool** as you add worker concurrency: N concurrent jobs × the
+> per-job DB calls will raise in-use connections. The new `mpiper_db_connections_*`
+> gauges (added during Track 3 follow-up) will show it.
+
+---
+
+## 4. Exact engineering targets — Track 1 (worker)
+
+Verify each before editing.
+
+**The single-threaded loop:**
+- `worker/consumer/main.py` `main()` — the loop is `while not shutdown: processed =
+  consumer.consume(stream); if not processed: sleep(job_poll_interval)`. One message
+  at a time, inline.
+- `worker/consumer/consumer.py` `consume()` — `xreadgroup(..., count=1, block=5000)`,
+  then dispatches inline via `_handle_job` / `_handle_asset_message`.
+- `worker/consumer/config.py` — `max_concurrent_jobs` (`MAX_CONCURRENT_JOBS`, default
+  5) **exists but is never used**. This is the semaphore size to honour.
+
+**Concurrency model (this choice *is* the lesson):**
+- Work is **CPU-bound**: Pillow (`images.py`) and ffmpeg (`videos.py`, via
+  `subprocess`). ffmpeg runs in a separate process (true parallelism regardless of
+  the GIL); Pillow releases the GIL for most ops. So a **thread pool** may suffice,
+  but a **process pool** gives guaranteed parallelism for the Python-side work.
+  Decide and document the tradeoff (GIL, memory, startup cost, picklability).
+- Read `count=N` (or keep `count=1` and dispatch to a bounded pool); cap in-flight at
+  `MAX_CONCURRENT_JOBS`.
+
+**Invariants that MUST survive concurrency:**
+- **Per-message ack.** Today `consume()` acks after the single job. With a pool,
+  track `msg_id` per task and `XACK` only that message on its own success; leave
+  failed ones unacked (they stay in the PEL for recovery).
+- **Idempotent claim.** `_handle_job` claims a job with `SELECT ... FOR UPDATE` and
+  checks `status == 'done'`. Concurrent consumers must each claim distinct rows;
+  don't weaken the row lock. Content-hash dedup (`check_for_duplicate`) also guards
+  double work.
+- **Asset state ownership.** `_handle_job` (not the processor) owns the
+  `failed`/`ready` transition — preserve that (see DEV-34 comment).
+- **Tracing.** The `worker.consume` span + pipeline spans must be started **inside
+  each task**, carrying that message's extracted `traceparent` (see
+  `_consume_span`). Don't share one span across concurrent jobs or the Tempo
+  waterfalls will merge.
+- **Per-job metrics.** `wm.record_job` / `wm.record_asset` are already called; keep
+  them per-task (asset_type label only — never asset_id on a metric).
+
+**Recovery — replace the homegrown scan:**
+- `consumer.py` `_recover_stuck_pending()` does a DB scan
+  (`status IN ('pending','in_progress') AND updated_at < now() - interval '2 minutes'`)
+  and re-`XADD`s. Replace with **`XAUTOCLAIM`** (or `XPENDING` + `XCLAIM`) on
+  `media:jobs` / `worker-group` to reclaim messages idle past a threshold from dead
+  consumers — using the stream's own delivery state. Keep it time-gated like
+  `_maybe_recover()`.
+
+**Dead-letter queue:**
+- Today poison messages are marked `failed` and the Redis message is dropped (acked
+  or abandoned). Add a **dead-letter stream** (e.g. `media:jobs:dlq`): when a job
+  exceeds `cfg.redis.max_retries`, `XADD` the message (with failure metadata) to the
+  DLQ and `XACK` the original, instead of silently dropping. Lets you inspect/replay.
+
+**Head-of-line blocking (optional, in the design):**
+- A 60s video blocks short thumbnails behind it. Consider **priority lanes** (e.g.
+  separate streams or a priority field) so small jobs don't queue behind large
+  transcodes.
+
+---
+
+## 5. Exact engineering targets — Track 1b (webhook dispatcher)
+
+**The serial delivery loop:**
+- `internal/webhook/dispatcher.go` `tick()` fetches a batch with
+  `... FOR UPDATE OF wd SKIP LOCKED LIMIT $BatchSize`, then **delivers them one at a
+  time** in `for _, row := range rows { d.deliver(ctx, row) }`. Each `deliver()` is a
+  synchronous HTTP POST with `d.client.Timeout`. **This serial loop is the
+  bottleneck.**
+- Config: `WEBHOOK_POLL_INTERVAL` (2s), `WEBHOOK_BATCH_SIZE` (50), `WEBHOOK_TIMEOUT`
+  (10s), `WEBHOOK_MAX_ATTEMPTS` (5) — in `internal/config/env.go`.
+
+**The move:**
+- Deliver the batch **concurrently** with a bounded pool (e.g. `errgroup` +
+  semaphore, or a worker-pool of size `WEBHOOK_CONCURRENCY`). HMAC signing, backoff
+  (`next_attempt_at`), and retry logic in `handleFailure`/`backoff` stay as-is.
+- **Wire the metrics.** `WebhookDeliveryTotal`, `WebhookDeliveryDuration`,
+  `WebhookDeliveryFailures` are **defined in `internal/metrics/metrics.go` but never
+  recorded** in the dispatcher. `NewDispatcher(db, logger, cfg)` doesn't take
+  `*metrics.Metrics` — extend it to accept `m`, record per delivery (labels:
+  `event`, `status` — **never** asset_id), and pass `m` from `cmd/server/main.go`
+  (where the dispatcher is constructed).
+- The SLI rule `sli:webhook_delivery_latency_seconds:p95` already exists; it just
+  needs the histogram to be recorded.
+
+**Concurrency-safety note:** `tick()` runs `SELECT ... FOR UPDATE SKIP LOCKED`
+**outside an explicit transaction** (`d.db.SelectContext`), so the row locks are
+released as soon as the SELECT returns — fine for one dispatcher with internal
+goroutines, but it does **not** prevent two *separate* dispatcher processes from
+grabbing the same row. If you ever run >1 dispatcher, wrap the claim in a tx or add a
+`claimed_at`/`locked_by` column. Document whichever you choose.
+
+---
+
+## 5b. Track 3 follow-ups to fold in (do these first; ~30 min)
+
+These were flagged in `experiments/0001` and the roadmap; doing them first means the
+0002/0003 experiments have clean, artifact-free numbers:
+
+- **Wire `webhook_delivery_*` metrics** (part of Track 1b above).
+- **Wire `storage_operation_*` metrics** (the `pkg/utils/storagex` layer doesn't
+  record them; the `/complete` MinIO-HEAD cost is currently invisible).
+- **Add a fine-bucket view for `db.query.duration`** in `internal/metrics/metrics.go`
+  (it uses default coarse buckets, so its p95 reads ~4.75 s — an artifact; true mean
+  is 18 ms). Mirror the existing `http`/`queue.processing.lag` views.
+- **Standardize histogram buckets** across worker/API so p95s aren't distorted when
+  old + new bucket boundaries mix in one query window (this bit the image-ready and
+  enqueue-lag SLIs).
+
+---
+
+## 6. Environment & topology (host = macOS)
+
+**Host ports → containers:**
+| Service | Host | Notes |
+|---|---|---|
+| API | 5010 | `/healthz`, `/api/v1/...` |
+| Postgres | 5433 | user `mpiper`, db `mpiper`, pw `changeme` |
+| Redis | 6380 | stream `media:jobs`, group `worker-group` |
+| MinIO API / console | 9000 / 9001 | bucket `mpiper`, minioadmin/minioadmin |
+| Grafana | 3000 | anon admin; folder **MPiper** |
+| Prometheus | 9090 | remote-write receiver enabled (for k6) |
+| Tempo | 3200 | pinned `grafana/tempo:2.6.1` |
+| OTel Collector | 8888/8889 | metrics; bridges `mpiper_net` ↔ `mpiper_obs_net` |
+
+**Container names:** `mpiper-api`, `mpiper-worker`, `mpiper-postgres`,
+`mpiper-redis`, `mpiper-minio`, `mpiper-otel-collector`, `mpiper-tempo`,
+`mpiper-prometheus`, `mpiper-grafana`, `mpiper-loki`, `mpiper-promtail`.
+
+**Compose overlays:** `docker-compose.yml` (core) + `docker-compose.observability.yml`
+(Tempo/Prom/Loki/Grafana/collector) + `docker-compose.loadtest.yml` (CPU/mem pins +
+`TRACE_SAMPLING_RATE=1.0`). `ENCRYPTION_KEY=0123456789abcdef0123456789abcdef`.
+
+**Metric naming (important):** the collector's Prometheus exporter uses
+`namespace: mpiper`. Go API instruments → `mpiper_http_server_request_duration_seconds_*`;
+**worker instruments already carry a `mpiper.` prefix → double prefix**
+`mpiper_mpiper_job_processing_success_total`, etc. k6 client metrics land under
+`k6_*` (custom ones as `k6_mpiper_*`).
+
+---
+
+## 7. Runbook / command cheat sheet
+
+```bash
+# Bring up core + observability + loadtest pins (everything, rebuild):
+docker compose -f docker-compose.yml -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml up -d --build
+
+# Rebuild just api/worker after code changes:
+docker compose -f docker-compose.yml -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml up -d --build api worker
+
+# Worker unit tests — the image entrypoint boots the worker, so OVERRIDE it:
+docker run --rm --entrypoint python -v "$PWD":/app -w /app mpiper-worker \
+  -m unittest discover -s worker/tests -p 'test_*.py' -v
+
+# Go: build / vet / test  (tests/performance_suite_test.go fails w/o PERF_TEST_URL — ignore)
+go build ./... && go vet ./internal/... && go test ./internal/... ./pkg/...
+
+# Load test (baseline profile to compare against exp 0001):
+./loadtest/run.sh open --rate 10/s --duration 90s        # arrival > service
+./loadtest/run.sh closed --vus 10 --duration 2m          # find max throughput
+
+# Query Prometheus history (data persists across `down` WITHOUT -v):
+#   Tempo retains traces 48h, Prometheus 30d. Instant queries only see the last
+#   5 min, so for past runs wrap in last_over_time(metric[12h]) / max_over_time.
+
+# Inspect Redis stream + consumer group / pending:
+docker exec mpiper-redis redis-cli XINFO GROUPS media:jobs
+docker exec mpiper-redis redis-cli XPENDING media:jobs worker-group
+
+# Inspect webhook backlog:
+docker exec mpiper-postgres psql -U mpiper -d mpiper -c \
+  "SELECT status, count(*) FROM webhook_deliveries GROUP BY status;"
+
+# UIs: Grafana http://localhost:3000 (Experiment Overview) · Prometheus :9090 · Tempo via Grafana Explore
+```
+
+---
+
+## 8. Landmines (already bit, or will)
+
+- **Worker tests:** the `mpiper-worker` image has an entrypoint that runs the worker;
+  you MUST `--entrypoint python` to run unittest, else it tries to boot + migrate and
+  hits the DB. The local `.venv` lacks deps — always test in the container.
+- **Tracing under concurrency:** start the `worker.consume` span (and pipeline spans)
+  *inside each task* with that message's context. Sharing context across
+  goroutines/tasks will corrupt the per-asset waterfalls. Verify in Tempo after.
+- **Ack discipline:** only `XACK` a message after *its* job succeeds. With a pool,
+  don't ack by position — ack by `msg_id`.
+- **Mixed histogram buckets:** changing bucket boundaries makes `histogram_quantile`
+  over a window that spans the change produce garbage p95s. After re-instrumenting,
+  either reset Prometheus data or wait for the old series to age out before reading.
+- **DB pool pressure:** more concurrent jobs → more in-use connections. Pool max is
+  25 (`mpiper_db_connections_max_open`). Watch `..._active` and `..._wait_count`.
+- **Webhook `SKIP LOCKED` without a tx:** safe for single-dispatcher internal
+  concurrency, NOT for multiple dispatcher processes (see §5).
+- **Operational flakiness seen this session:** an aborted `compose up` (a stray
+  `mpiper-webhook-receiver` on host :8888 collided with the collector) left
+  containers with stale port publishing (`docker port` empty) and detached the
+  collector from `mpiper_obs_net`. Fix = `up -d --force-recreate <svc>`. If telemetry
+  "disappears," check the collector is on both networks and Prometheus targets are up.
+- **k6:** no `TextEncoder` in its runtime (use charCodes); client metrics are prefixed
+  `k6_`; remote-write target is `http://localhost:9090/api/v1/write`.
+- **Dedup hides work:** the harness fans out unique bytes per iteration; keep that or
+  repeat runs do near-zero work.
+- **Don't put `asset_id` on a metric label** (high cardinality) — span attribute only.
+
+---
+
+## 9. Acceptance / how we'll know it worked
+
+- **Track 1:** re-run `open --rate 10/s` → μ rises ~N× (pool size, until CPU-bound),
+  queue depth **stabilizes/drains** instead of growing, job success stays 100% (no
+  double-processing — verify via DB job counts and dedup). A killed-mid-job consumer's
+  message is reclaimed by `XAUTOCLAIM`; a poison message lands in `media:jobs:dlq`.
+  Write `experiments/0002-concurrent-worker.md` (before/after table + a trace).
+- **Track 1b:** under the same load, `webhook_pending` **drains to ~0**; the new
+  webhook delivery-rate and p95 panels populate; `sli:webhook_delivery_latency_seconds:p95`
+  renders. Write `experiments/0003-webhook-throughput.md`.
+
+Each writeup follows the `0001` template: setup (with resource pins) → method → before
+numbers → the trace/dashboard evidence → conclusion. Local results are **relative** —
+trust deltas and bottleneck location, not absolute throughput.
+
+---
+
+## 10. Repo / git state at handoff
+
+- **Branch:** `feat/track-03-observability` (cut from `staging`), **10 commits**,
+  Track 3 work committed (tracing, worker instrumentation, log correlation, metric
+  fixes + DB pool gauges, observability infra, Grafana provisioning fix, dashboards,
+  k6 harness, `experiments/0001`).
+- **Uncommitted at handoff:** the roadmap README rewrite (`docs/enhancements/README.md`)
+  and this handoff doc — commit them at the start of the Track 1 session
+  (`docs(roadmap): mark Track 3 done, re-prioritize from exp 0001`).
+- **Not pushed yet.** Decide whether to push `feat/track-03-observability` + open a PR
+  against `staging` before branching for Track 1, or continue on the same branch.
+- **Key reads:** `experiments/0001-worker-saturation.md` (the baseline),
+  `docs/enhancements/README.md` (re-prioritized roadmap),
+  `track-01-concurrent-worker.md` (full design — write it out before coding, per the
+  per-track design-doc philosophy), and `track-03-handoff.md` (the doc that started
+  the Track 3 session, for format).
+
+---
+
+## 11. Suggested first-session scope
+
+Do the **§5b follow-ups + Track 1b first** (small, high-value, makes the next
+experiment clean), then **Track 1**:
+
+1. **Warm-up (§5b):** wire `webhook_delivery_*` + `storage_operation_*` metrics, add
+   the `db.query.duration` view. *Demo:* those panels populate.
+2. **Track 1b:** concurrent webhook delivery + pass `m` into the dispatcher. *Demo:*
+   `webhook_pending` drains under load → `experiments/0003`.
+3. **Track 1:** bounded worker pool honouring `MAX_CONCURRENT_JOBS` (pick process vs
+   thread, document why), preserving ack/idempotency/tracing invariants. *Demo:* μ
+   scales, queue stabilizes → `experiments/0002`.
+4. **Then** `XAUTOCLAIM` recovery + DLQ stream, and (optional) priority lanes.
+
+That order banks two quick, demoable wins (clean metrics + webhooks draining) before
+the larger concurrency change, and every step is provable by re-running the existing
+k6 profile against the Track 3 dashboards.
diff --git a/docs/enhancements/track-02-handoff.md b/docs/enhancements/track-02-handoff.md
new file mode 100644
index 0000000..4645124
--- /dev/null
+++ b/docs/enhancements/track-02-handoff.md
@@ -0,0 +1,138 @@
+# Track 2 — Queue-depth autoscaling — Session Handoff (start here)
+
+**Purpose:** everything a fresh conversation needs to begin **Track 2 (scale the
+worker fleet on queue lag)** without prior context. Read this top to bottom. It is
+the *operational* companion; pair it with a short design doc
+(`track-02-autoscaling.md`) written before coding, per the per-track design-doc
+philosophy. Assumes **Tracks 3, 1, and 1b are done** — tracing/SLOs/dashboards/k6
+exist, the worker is a bounded concurrent pool, and webhook delivery is concurrent.
+
+---
+
+## 1. What MPiper is (60-second orientation)
+
+Go **API** (`cmd/server`, `internal/`) accepts uploads; a Python **worker**
+(`worker/`) processes them. They talk over **Redis Streams** (`media:jobs`, group
+`worker-group`). **Postgres** is the source of truth; **MinIO** stores objects.
+Full orientation + topology + runbook live in
+[`track-01-handoff.md`](track-01-handoff.md) §1, §6, §7 — reuse them.
+
+**What Track 1 changed (your starting point):** the worker now runs a bounded
+`ThreadPoolExecutor` sized by `MAX_CONCURRENT_JOBS` (honour `mcj ≈ cores-per-pod`),
+recovers dead-consumer messages with `XAUTOCLAIM`, and dead-letters poison
+messages to `media:jobs:dlq`. Measured **2.37× throughput** at `mcj=4` on 4 cores
+(`experiments/0002`). Crucially: **per-pod throughput now scales with cores, so the
+next lever is more pods.**
+
+---
+
+## 2. The goal in one sentence
+
+Make the **number of worker pods** track the **Redis Streams backlog** (queue lag),
+so the pipeline absorbs bursts (scale up) and stops wasting capacity when idle
+(scale down) — a closed control loop driven by a real saturation signal, not CPU.
+
+> Why queue-lag, not CPU: a CPU-based HPA reacts to a *symptom*, not the *cause*,
+> and lags behind bursty I/O+transcode work. Queue depth / oldest-message-age is the
+> direct backpressure signal (Little's Law: `L = λW` — a backlog `L` that keeps
+> growing means arrivals outpace service, `λ > μ`, i.e. add workers).
+
+---
+
+## 3. Prerequisites & gotchas verified in code (do these FIRST)
+
+- **The scaling signal that exists today is `queue.depth = XLEN`, which is NOT a
+  true backlog.** `RegisterQueueDepthFunc` *is* wired — in
+  `internal/queue/queue.go` (~L79), not `main.go` — and reports `XLEN media:jobs`.
+  But `XLEN` counts **all** stream entries, including acked-but-untrimmed ones
+  (`MaxStreamLength: 10_000` in `queue.NewRedisQueue`), so it stays high even when
+  the backlog is drained. **Don't autoscale on `queue.depth`.** `queue.processing.lag`
+  (a histogram, recorded in `queue.go` ~L177) measures per-message wait; it is not
+  a gauge a scaler can act on either.
+- **⚠️ Task 0: expose a true backlog signal.** Add a gauge for the consumer-group
+  **`lag`** (undelivered entries) and/or the **oldest-pending age** — e.g.
+  `XINFO GROUPS media:jobs` → `lag`, or `XPENDING` for the idle time of the oldest
+  pending entry. This is the signal a lag-driven scaler reads; `XLEN` will mislead it.
+  Decide and document which (lag vs age) drives scaling.
+- **k8s manifests already exist** but scale on the wrong signal: `deploy/k8s/worker-deployment.yaml`
+  has `replicas: 2` and a **`HorizontalPodAutoscaler` (min 2 / max 10) on CPU 75% /
+  mem 85%** — and **no `terminationGracePeriodSeconds`** (so it defaults to 30s).
+  Track 2 replaces/augments the HPA with a **lag-driven** scaler.
+- **`mcj ≈ cores-per-pod` (Track 1 lesson).** Autoscaling adds *pods*; each pod runs
+  its own thread pool. Don't crank `MAX_CONCURRENT_JOBS` — set it to the pod's CPU
+  limit and scale pod count. Oversubscription was measured to *reduce* throughput.
+- **Recovery interplay.** New pods join `worker-group` and read `>` (new messages);
+  a scaled-*down* pod's in-flight work is abandoned and reclaimed by `XAUTOCLAIM`
+  after `RECOVERY_MIN_IDLE_MS` (default 120s). For responsive scale-down, set the
+  pod's `terminationGracePeriodSeconds` ≥ the worker's `SHUTDOWN_DRAIN_TIMEOUT`
+  (default 30s, in `worker/consumer/main.py`) so in-flight jobs drain cleanly
+  instead of being abandoned and waiting out the 120s reclaim. Both default to 30s
+  today — tight; widen the grace period if jobs run longer.
+- **DB pool pressure.** N pods × `mcj` connections. Each pod sizes its pool to
+  `mcj + 2` (`worker/consumer/db.py`). At max replicas this can exceed Postgres'
+  `max_connections` — compute `maxReplicas × (mcj+2) + API pool` and cap accordingly
+  (watch `mpiper_db_connections_*`).
+
+---
+
+## 4. Engineering targets
+
+1. **Wire the scaling signal** (Task 0 above): record consumer-group lag (and/or
+   oldest-pending age) as a gauge, expose it where the scaler can read it.
+2. **Choose the scaler.** Options to weigh in the design doc:
+   - **KEDA `redis-streams` scaler** — purpose-built; scales a Deployment on
+     `pendingEntriesCount`/lag of a stream+group. Cleanest fit; needs KEDA installed.
+   - **Prometheus-adapter + HPA on a custom metric** (the lag gauge) — reuses the
+     existing Prometheus, no new operator, but more wiring.
+   - **A custom controller** — most work, most to learn; probably overkill.
+   Recommend KEDA `redis-streams`; document the tradeoff.
+3. **Tune the control loop.** Target lag per pod, `pollingInterval`,
+   `cooldownPeriod`/stabilization, min/max replicas. Avoid flapping (hysteresis).
+4. **Graceful scale-down.** Confirm SIGTERM → bounded drain → no lost work (relies on
+   Track 1's `shutdown(timeout)` + `XAUTOCLAIM` safety net).
+
+---
+
+## 5. Acceptance / how we'll know it worked
+
+Re-use the k6 harness and the consolidated overlay (`docker-compose.loadtest.yml`
+env knobs; `./loadtest/run.sh`). For k8s, run on the cluster the manifests target
+(or kind/minikube + KEDA).
+
+- **Backlog → scale-up → drain cycle:** drive `open --rate` above one pod's μ; the
+  scaler adds pods; aggregate μ rises ~linearly with pods (until CPU/DB-bound);
+  **lag rises then drains to ~0**; then load stops → pods **scale back down** after
+  cooldown.
+- **No flapping** under steady load; **no lost/double-processed jobs** across
+  scale events (verify via DB job counts + dedup; a scaled-down pod's job is
+  reclaimed, not dropped).
+- **DB pool stays under `max_connections`** at `maxReplicas`.
+- Write `experiments/0004-autoscaling.md` (0001 template: setup w/ pod & resource
+  limits → method → backlog/replica/lag timeseries → conclusion). Capture the
+  replica-count and lag panels.
+
+---
+
+## 6. Suggested first-session scope
+
+1. **Task 0:** add a **consumer-group lag** gauge (and/or oldest-pending age) — a
+   *new* observable gauge alongside the existing (misleading) `queue.depth=XLEN`;
+   wire it like `queue.go` wires `RegisterQueueDepthFunc`, plus a Grafana panel.
+   Prove it tracks a manually-`XADD`'d backlog and falls to ~0 on drain. *Demo:*
+   panel moves with `XADD`/drain (and, unlike `queue.depth`, returns to 0).
+2. **Design doc** `track-02-autoscaling.md`: signal choice (lag vs depth vs age),
+   scaler choice (KEDA vs prom-adapter), control-loop params, scale-down safety.
+3. **KEDA `ScaledObject`** on the worker Deployment driven by the lag signal;
+   replace the CPU HPA. Set `mcj` = pod CPU limit; min/max replicas; cooldown.
+4. **Load test the cycle** + `experiments/0004`.
+
+Banks a clean, demoable win (lag-driven scale-up/drain) on top of the now-concurrent
+worker, and is provable by re-running the existing k6 profile against the dashboards.
+
+---
+
+## 7. Key reads
+
+- [`track-01-handoff.md`](track-01-handoff.md) — topology, runbook, env, landmines (reuse).
+- [`experiments/0002-concurrent-worker.md`](../../experiments/0002-concurrent-worker.md) — the per-pod baseline μ to multiply by replica count.
+- `internal/queue/queue.go` (~L79 `RegisterQueueDepthFunc`→`XLen`, ~L177 `QueueProcessingLag`) — model the new lag gauge on this wiring; `internal/queue/redis.go` (`XLen`, add an `XInfoGroups`/`XPending` helper); `internal/metrics/metrics.go` (instrument defs); `deploy/k8s/worker-deployment.yaml` (current CPU HPA + missing `terminationGracePeriodSeconds`).
diff --git a/docs/enhancements/track-03-handoff.md b/docs/enhancements/track-03-handoff.md
new file mode 100644
index 0000000..089c030
--- /dev/null
+++ b/docs/enhancements/track-03-handoff.md
@@ -0,0 +1,222 @@
+# Track 3 — Session Handoff (start here)
+
+**Purpose:** everything a fresh conversation needs to begin **Track 3
+(end-to-end tracing, SLOs & local load testing)** without prior context. Read
+this top to bottom, then open `track-03-observability-and-load.md` for the full
+design and phased plan. This doc is the *operational* companion: where things
+are, how to run them, and the landmines already discovered.
+
+---
+
+## 1. What MPiper is (60-second orientation)
+
+A media-processing pipeline: a **Go API** (`cmd/server`, `internal/`) accepts
+uploads and a **Python worker** (`worker/`) processes them. They communicate over
+**Redis Streams** (`media:jobs`). **Postgres** is the durable source of truth;
+**MinIO** (S3-compatible) stores objects. Webhooks notify clients of job
+lifecycle events.
+
+**Asset flow:**
+`POST /api/v1/storage/presign` → client `PUT`s file to MinIO →
+`GET /api/v1/assets/{id}/complete` (writes asset `uploaded` + job + outbox row +
+`job.starting` webhook rows in one tx) → **outbox relay** (1s poll) publishes to
+Redis → **worker** consumes → image (3 webp variants) or video (poster + 720p +
+preview) → variants written to MinIO + Postgres, asset `ready` → worker inserts
+`job.started`/`job.done` webhook rows → **dispatcher** (2s poll) delivers signed POSTs.
+
+---
+
+## 2. The Track 3 goal in one sentence
+
+Make one **trace per asset** that spans API → Redis → worker → ffmpeg (so queue
+wait and per-stage time are visible), define a small set of **SLOs**, and build a
+**local k6 load harness** + Grafana dashboards so we can saturate the system on a
+laptop and *see* where it bends. Full plan: `track-03-observability-and-load.md`.
+
+---
+
+## 3. Current telemetry state (verified in code)
+
+- **Go API:** OTel **traces + metrics**, exported OTLP to `otel-collector:4317`.
+  Tracer init in `internal/metrics/otel.go`; metric instruments in
+  `internal/metrics/metrics.go`.
+- **Python worker:** OTel **metrics only** (`worker/utils/metrics.py`, OTLP to
+  `otel-collector:4317`). **No tracer, no span creation, no context extraction.**
+- **The gap:** the Go side traces the HTTP request and `Enqueue`, but **never
+  injects a `traceparent`** into the Redis message or the outbox row. The worker
+  therefore starts fresh with no parent. The trace dies at the queue boundary.
+- **Observability stack** (`docker-compose.observability.yml`, configs in
+  `observability/`): OTel Collector (bridges `mpiper_net` ↔ `mpiper_obs_net`),
+  **Tempo** (traces), **Prometheus** (metrics), **Loki + Promtail** (logs),
+  **Grafana** (dashboards, anonymous admin). Collector pipeline: OTLP receiver →
+  Tempo (traces) + Prometheus exporter `:8889` (metrics).
+
+> Note: `CLAUDE.md` historically said the worker is "prometheus_client (not OTel)"
+> — that's **stale/wrong**; the worker uses OTel metrics. Don't trust that line.
+
+---
+
+## 4. Exact engineering targets for Phase 1 (close the trace gap)
+
+These are the precise seams to touch. Verify each before editing.
+
+**Inject context (Go):**
+- `internal/queue/queue.go` — `RedisQueue.Enqueue` builds the stream message
+  (a `map`); the worker reads its fields. Inject `traceparent` (and `tracestate`)
+  here using the OTel propagator, as top-level message field(s).
+- `internal/outbox/relay.go` — `tick()` unmarshals the outbox row payload and calls
+  `queue.Enqueue(ctx, payload)`. Because enqueue is **store-and-forward**, the
+  trace context must survive in the **outbox row** too: capture it when the row is
+  written in `internal/service/asset.go` (`MarkAssetUploaded`, the
+  `outboxRepo.InsertTx` call), persist it (extend `internal/models/outbox.go` +
+  `internal/repository/outbox_repo.go` + a migration), and re-inject on relay.
+- **Verify the global propagator is set** in `internal/metrics/otel.go`
+  (`otel.SetTextMapPropagator(propagation.TraceContext{})`). If missing, add it —
+  without it, injection is a no-op.
+
+**Extract + continue (Python):**
+- Add `worker/utils/tracing.py` mirroring `worker/utils/metrics.py` (tracer init,
+  OTLP exporter to the same endpoint). Find where `init_metrics(...)` is called and
+  init the tracer alongside it (same lifecycle).
+- `worker/consumer/consumer.py` — in `consume()`, after the message payload is
+  normalized (note: a `body` field, if present, is JSON-decoded and merged), read
+  `traceparent` and start the consumer span. Use a **child span with a link** to
+  the producer context (link is the correct primitive for queue fan-in; child span
+  keeps the Tempo waterfall readable).
+
+**Span the stages (Phase 2):**
+- `worker/processing/processor.py` — `process_asset_dispatch` (download, dedup check).
+- `worker/processing/images.py` — per-variant encode/upload.
+- `worker/processing/videos.py` — `run()` wraps each ffmpeg call (poster / transcode_720p / preview).
+- Stamp `trace_id`/`span_id` into worker + API structured logs for Loki↔Tempo linking.
+
+**Message format reminder:** the consumer accepts either `job_id` (canonical) or
+`asset_id`. The outbox payload (built in `asset.go`) currently carries `job_id`,
+`asset_id`, `event`, `timestamp`. Add trace context as additional field(s); don't
+break the existing keys.
+
+---
+
+## 5. Environment & topology facts (host = macOS)
+
+**Host ports → containers:**
+| Service | Host | Container | Notes |
+|---|---|---|---|
+| API | 5010 | 5010 | `/healthz`, `/api/v1/...` |
+| Postgres | 5433 | 5432 | user `mpiper`, db `mpiper`, pw `changeme` |
+| Redis | 6380 | 6379 | stream `media:jobs`, group `worker-group` |
+| MinIO API | 9000 | 9000 | bucket `mpiper` (anon download on) |
+| MinIO console | 9001 | 9001 | minioadmin / minioadmin |
+| Grafana | 3000 | 3000 | anon admin |
+| Prometheus | 9090 | 9090 | |
+| Tempo | 3200 | 3200 | OTLP in on 4317/4318 (obs net) |
+| webhook-receiver | 8888 | 8080 | overlay only |
+
+**Container names:** `mpiper-api`, `mpiper-worker`, `mpiper-postgres`,
+`mpiper-redis`, `mpiper-minio`, `mpiper-webhook-receiver`, `mpiper-otel-collector`,
+`mpiper-tempo`, `mpiper-prometheus`, `mpiper-grafana`, `mpiper-loki`.
+
+**Storage split endpoint (implemented):** `S3_ENDPOINT_URL=http://minio:9000`
+(internal I/O) vs `S3_PUBLIC_ENDPOINT_URL=http://localhost:9000` (presigned +
+public URLs). Don't undo this — host-run load tests depend on it.
+
+**Telemetry env (`.env.local`):** `OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317`,
+`OTEL_TLS_INSECURE=true`, `TRACE_SAMPLING_RATE` (default 0.1 in code — **set to
+1.0 locally** so every asset traces). `ENCRYPTION_KEY=0123456789abcdef0123456789abcdef`
+(32 bytes; used for auth tokens AND webhook secrets).
+
+---
+
+## 6. Runbook / command cheat sheet
+
+```bash
+# Bring up core + observability (+ webhooks if you want webhook traces too)
+docker compose -f docker-compose.yml -f docker-compose.observability.yml up -d --build
+# add: -f docker-compose.webhooks.yml   (for webhook receiver)
+
+# End-to-end smoke (host-run; image + video + webhooks; 23 checks)
+./scripts/demo-e2e.sh
+
+# Go: build / vet / tests  (tests/performance_suite_test.go FAILS unless PERF_TEST_URL set — ignore)
+go build ./... && go vet ./... && go test ./...
+
+# Worker tests: the local .venv (py3.14) LACKS psycopg_pool/pytest/cryptography.
+# Run them INSIDE the worker container instead:
+docker exec -w /app mpiper-worker python -m unittest discover -s worker/tests -p 'test_*.py' -v
+
+# Mint an auth token from the host (system python3 has `cryptography`; venv does not):
+TOKEN=$(python3 - <<'PY'
+import base64, os
+from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+key=b"0123456789abcdef0123456789abcdef"; nonce=os.urandom(12)
+print(base64.urlsafe_b64encode(nonce+AESGCM(key).encrypt(nonce,b"demo-user",None)).rstrip(b"=").decode())
+PY
+)
+
+# Inspect DB
+docker exec mpiper-postgres psql -U mpiper -d mpiper -c "SELECT asset_id,status,type FROM assets ORDER BY created_at DESC LIMIT 5;"
+
+# Reset all state (assets/variants/objects accumulate across runs)
+docker compose -f docker-compose.yml -f docker-compose.observability.yml down -v
+
+# UIs: Grafana http://localhost:3000 · Prometheus http://localhost:9090 · Tempo via Grafana Explore
+```
+
+---
+
+## 7. Landmines (things that already bit, or will)
+
+- **Worker is single-threaded.** `MAX_CONCURRENT_JOBS` is in
+  `worker/consumer/config.py` but **never used**; `consume()` does `count=1` and processes inline.
+  This is the expected bottleneck Phase 5 should prove — don't "fix" it in Track 3.
+- **Recovery is homegrown.** A 2-min DB scan re-`XADD`s stale jobs; no
+  `XPENDING`/`XAUTOCLAIM`; poison messages are marked `failed` and dropped (no DLQ).
+  That's Track 1, not Track 3.
+- **Global propagator may be unset** in Go — injection silently no-ops without it. Check first.
+- **Sampling.** Code default `TRACE_SAMPLING_RATE=0.1`. Set 1.0 locally or you'll
+  lose most traces and think propagation is broken.
+- **Dedup hides work.** Identical fixtures dedup after the first asset → near-zero
+  work on repeats. The load harness must **fan out unique bytes** to measure real
+  per-job cost.
+- **Cardinality.** asset_id is fine as a *trace/span attribute*; **never** put it on
+  a *metric* label.
+- **Health check.** `cmd/server --health-check` is now a real `/healthz` probe
+  (was previously booting a second server and failing to bind 5010 → api unhealthy
+  → worker wouldn't start). If you change startup, keep that path lightweight.
+- **Rebuild after code changes.** api/worker run from built images:
+  `docker compose ... build api worker && docker compose ... up -d`.
+- **Local ≠ prod.** Trust bottleneck *location* and before/after deltas, not
+  absolute throughput numbers.
+
+---
+
+## 8. Suggested first-session scope
+
+Do **Phase 0 + Phase 1** together (highest value, gets a real cross-boundary trace fast):
+
+1. **Phase 0:** add `deploy.resources.limits` (cpu/mem) to `api` + `worker` in a
+   compose overlay; set `TRACE_SAMPLING_RATE=1.0`; bring up with the observability
+   overlay; capture a baseline `demo-e2e.sh` run and confirm spans land in Tempo.
+2. **Phase 1:** Go `traceparent` injection (enqueue + outbox row + migration) →
+   worker tracer + extraction in `consume()`. 
+
+**Acceptance:** open one asset in Grafana/Tempo and see a single trace from the
+`/api/v1/assets/{id}/complete` request through `enqueue` → (visible queue-wait gap) →
+worker `consume` span. That alone is a satisfying, demoable win.
+
+Then continue with Phases 2–5 from the design doc (pipeline spans + log
+correlation → SLO recording rules + dashboards → k6 harness → first experiment
+writeup that names the worker bottleneck, feeding Track 1).
+
+---
+
+## 9. Repo / git state at handoff
+
+- Branch: `feat/webhook-notifications`; open **PR #18**.
+- Demo-readiness + split-endpoint work is committed (`9404c7a`) and pushed.
+- `docs/enhancements/` (this file + `README.md` + `track-03-observability-and-load.md`)
+  may be **uncommitted** — commit them at the start of the Track 3 session.
+- Key docs to read: `docs/enhancements/README.md` (catalog),
+  `track-03-observability-and-load.md` (plan), `docs/arch/*` (existing
+  outbox/reliability design notes), `CLAUDE.md` (repo conventions; note the stale
+  worker-telemetry line).
diff --git a/docs/enhancements/track-03-observability-and-load.md b/docs/enhancements/track-03-observability-and-load.md
new file mode 100644
index 0000000..43ff9c9
--- /dev/null
+++ b/docs/enhancements/track-03-observability-and-load.md
@@ -0,0 +1,214 @@
+# Track 3 — End-to-end tracing, SLOs & local load testing
+
+**Status:** planning · **Prereq:** none · **Unlocks:** makes every other track measurable
+
+## 1. Problem
+
+We can't improve what we can't see. Right now:
+
+- The **distributed trace breaks at the Redis boundary.** The Go API traces the
+  HTTP request and the `Enqueue` call, but it never injects a `traceparent` into
+  the stream message. The worker has OTel **metrics** but **no tracer** and does
+  no context extraction. So we cannot answer "for *this* asset, where did the 40
+  seconds go?" as a single trace spanning API → outbox → Redis → worker → ffmpeg →
+  variant write.
+- We have metrics but **no SLOs** — no agreed definition of "good", so no way to
+  say whether a change helped.
+- We have **no way to generate controlled load**, so we've never seen the system
+  bend. The single-threaded worker (Track 1) is an invisible bottleneck until
+  something pushes on it.
+
+The user's real question: *this is a local project — how do we test under load
+and actually understand what's working, failing, and needs optimization?*
+
+That question is answered in §3.
+
+## 2. Goals / Non-goals
+
+**Goals**
+- One trace per asset, end to end, across the queue boundary, viewable in Tempo.
+- A small, explicit set of **SLIs and SLOs** for the pipeline.
+- A repeatable **local load harness** that can saturate the system on a laptop.
+- Grafana dashboards (RED for the API, USE for the worker/host, a pipeline-latency
+  funnel, queue lag) wired so a metric spike links to an example trace (exemplars).
+- A written **bottleneck-analysis loop**: load → observe → locate → optimize → re-run → compare.
+
+**Non-goals**
+- Production-scale absolute numbers. Local results are **relative** — they reveal
+  bottlenecks and validate *direction*, not real-world capacity (see §7).
+- Alerting/paging infrastructure (note SLO burn-rate alerts as a follow-up).
+- Replacing the existing stack — we extend the bundled Tempo/Prometheus/Loki/Grafana.
+
+## 3. Can you load-test meaningfully on a laptop? Yes — here's the methodology
+
+The misconception is that load testing requires cloud scale. It doesn't. Load
+testing is about **saturating the system relative to its own capacity** and
+watching where it bends. A single-threaded worker on a laptop saturates at a
+handful of concurrent jobs — you can absolutely push it past that locally.
+
+The thing that makes local results *interpretable* is **pinning resources** so
+runs are reproducible and the bottleneck isn't hidden by spare laptop cores. We
+add CPU/memory limits to the `api` and `worker` containers (compose
+`deploy.resources.limits`) so "the worker is the bottleneck" is a stable, observable fact
+rather than something that moves run to run.
+
+**The loop we're building:**
+
+```
+            ┌─────────────────────────────────────────────┐
+            │ 1. Define SLIs/SLOs (what "good" means)       │
+            └───────────────┬─────────────────────────────┘
+                            ▼
+            ┌─────────────────────────────────────────────┐
+            │ 2. Instrument end-to-end (close the trace gap)│
+            └───────────────┬─────────────────────────────┘
+                            ▼
+   ┌────────────┐   generate    ┌─────────────────────────┐
+   │ k6 (host)  │ ────────────▶ │ MPiper (CPU-pinned)      │
+   │ load model │   presign→PUT │  API + worker            │
+   └────────────┘   →complete   └───────────┬─────────────┘
+        │ client-side metrics               │ app OTel traces+metrics
+        ▼                                    ▼
+   ┌─────────────────────────────────────────────────────┐
+   │ 3. Observe in Grafana: RED, USE, pipeline funnel,     │
+   │    queue lag — metric spike → exemplar trace in Tempo │
+   └───────────────┬─────────────────────────────────────┘
+                   ▼
+   ┌─────────────────────────────────────────────────────┐
+   │ 4. Locate bottleneck (trace waterfall + USE) →        │
+   │    optimize → re-run same profile → compare           │
+   └─────────────────────────────────────────────────────┘
+```
+
+### Load model (this is the subtle part)
+
+- **Closed model (fixed VUs):** N virtual users each loop presign→upload→complete
+  as fast as they can. Good for finding max throughput and saturation point.
+- **Open model (fixed arrival rate):** X new uploads/sec regardless of how fast the
+  system responds. Good for finding the **latency knee** and watching queue lag
+  grow when arrival rate > service rate (a live demonstration of Little's Law:
+  `L = λW`).
+
+We use **k6** run from the **host** (like `scripts/demo-e2e.sh`): the host can
+reach both the API (`localhost:5010`) and MinIO (`localhost:9000`), so k6 performs
+the *real* client flow — presign, `PUT` the file to the public endpoint, then
+`complete`. k6 uploads real fixtures (the existing image +
+`tests/test_assets/sample.mp4`), optionally fanning out copies with unique bytes
+to defeat content-hash dedup when we want true per-job work.
+
+Two views of the same run:
+- **Client view** (k6's own metrics): request rate, error rate, client-side
+  latency percentiles → remote-written to the bundled Prometheus.
+- **Server view** (MPiper's OTel): the pipeline's internal spans and metrics —
+  this is the point of the track, and what we'll mostly read.
+
+## 4. Design
+
+### 4.1 Close the trace gap (the core engineering work)
+
+1. **Inject context on enqueue (Go).** When the outbox relay (or
+   `RedisQueue.Enqueue`) publishes, inject the active span context as a `traceparent`
+   field in the stream message using the OTel propagator. The outbox row should carry
+   the trace context too (so the trace survives the store-and-forward hop).
+2. **Extract + continue on consume (Python).** Add an OTel **tracer** to the worker
+   (mirroring `worker/utils/metrics.py`). In `consume()`, extract `traceparent`
+   from the message and start the consumer span as a **child** of it (strictly, a
+   span link is the canonical primitive for queue fan-in; we use a child span
+   *plus* a link so the waterfall stays readable).
+3. **Span the pipeline stages.** Wrap `process_asset_dispatch`, download,
+   dedup-check, each image variant, and each ffmpeg invocation (poster / transcode /
+   preview) in spans with attributes (asset_id, type, bytes, role, ffmpeg rc).
+4. **Correlate logs.** Stamp `trace_id`/`span_id` into worker + API structured logs
+   so Loki ↔ Tempo cross-linking works in Grafana.
+
+End result: open an asset in Tempo and see `HTTP POST /presign … → enqueue →
+(time in queue) → worker consume → download → transcode_720p → write variant`,
+with the **queue wait time** visible as the gap between enqueue and consume.
+
+### 4.2 SLIs / SLOs (initial, deliberately small)
+
+| SLI | Definition | Initial SLO (local) |
+|-----|------------|---------------------|
+| Presign latency | p95 of `POST /storage/presign` | < 150 ms |
+| Image ready latency | p95 (complete → asset `ready`) for images | < 5 s |
+| Video ready latency | p95 (complete → asset `ready`) for videos | < 60 s |
+| Queue wait | p95 (enqueue → consume start) | < 2 s |
+| Job success rate | done / (done + failed) | > 99% |
+| Webhook delivery latency | p95 (event row created → delivered) | < 10 s |
+
+These come straight from spans/metrics we'll have. The numbers are starting
+guesses; the *point* is to make them explicit, then move them based on data.
+
+### 4.3 Dashboards (Grafana, provisioned in `observability/grafana/dashboards`)
+
+- **API — RED:** request **R**ate, **E**rror rate, **D**uration (p50/p95/p99) per route.
+- **Worker/host — USE:** CPU/mem **U**tilization, **S**aturation (queue depth,
+  in-flight jobs), **E**rrors. (cAdvisor/node metrics or the collector's own.)
+- **Pipeline funnel:** uploaded → processing → ready/failed counts + the
+  per-stage latency breakdown (from spans).
+- **Queue health:** stream length, oldest-pending age, outbox relay lag (metric
+  already exists), webhook pending gauge (already exists).
+- **Exemplars:** histogram panels link a bucket spike to a concrete Tempo trace.
+
+### 4.4 Bottleneck-analysis loop (documented runbook)
+
+For each experiment: fix a load profile, run it, then read in order — (1) is the
+SLO breached? (2) USE: is the worker CPU-saturated or queue-saturated? (3) open an
+exemplar trace: which span dominates? (4) form a hypothesis, change one thing,
+re-run the **same** profile, compare. Record results in an `experiments/` log so
+"the transcode span dropped from 38s→6s after X" is captured.
+
+## 5. Phased implementation plan
+
+Each phase is independently demoable.
+
+- **Phase 0 — Resource pinning & baseline.** Add `deploy.resources.limits` to
+  api/worker; bring up the observability overlay; capture a one-shot baseline with
+  the existing `demo-e2e.sh`. *Demo:* Grafana shows the run; numbers are reproducible.
+- **Phase 1 — Trace propagation.** Inject `traceparent` on enqueue (Go) + outbox
+  row; extract + continue in the worker; add the worker tracer. *Demo:* a single
+  Tempo trace spans API→worker for one asset, with visible queue wait.
+- **Phase 2 — Pipeline spans + log correlation.** Span dispatch/download/dedup,
+  each variant, and each ffmpeg call; add trace IDs to logs. *Demo:* trace waterfall
+  shows per-stage timing; click a log line → its trace.
+- **Phase 3 — SLO recording rules + dashboards.** Prometheus recording rules for
+  the SLIs in §4.2; provision the four dashboards. *Demo:* a dashboard shows each
+  SLI vs its SLO target.
+- **Phase 4 — k6 load harness.** `loadtest/` with closed- and open-model scripts,
+  a host-run wrapper, fixture fan-out, and k6→Prometheus remote write. *Demo:*
+  `./loadtest/run.sh open --rate 5/s --duration 3m` drives the system; Grafana
+  shows queue lag climbing and the latency knee.
+- **Phase 5 — First experiment writeup.** Run a saturating profile, capture the
+  bottleneck (expected: the single-threaded worker), and write it up as the
+  motivating evidence for **Track 1**. *Demo:* `experiments/0001-worker-saturation.md`
+  with before numbers + the trace proving where time goes.
+
+## 6. How we'll know it works (acceptance)
+
+- A Tempo trace for any asset includes both API and worker spans, with queue wait
+  time visible.
+- Every SLI in §4.2 renders on a dashboard against its target.
+- `loadtest/run.sh` reproducibly drives the system into SLO breach, and the
+  responsible stage is identifiable from a trace within ~2 minutes of looking.
+- Phase 5 writeup names the bottleneck with evidence — the input to Track 1.
+
+## 7. Risks & honest caveats
+
+- **Local ≠ production.** Absolute numbers are not portable (laptop CPU, no network
+  latency, single-node Redis/PG). Treat results as **relative**: bottleneck
+  location and before/after deltas are trustworthy; "we do N uploads/sec" is not.
+- **Noisy neighbor.** k6, the app, and the observability stack share the laptop.
+  Pin app resources and keep k6 modest; consider running k6 with `--throw` budgets.
+- **Container CPU limits change behavior** (e.g. ffmpeg threads). That's fine — it's
+  what makes runs comparable — but document the limits with each experiment.
+- **Trace cardinality / sampling.** Asset-ID attributes are high-cardinality on
+  *traces* (OK) but must never become metric labels. Keep `TRACE_SAMPLING_RATE`
+  in mind; sample at 100% locally, lower in prod.
+- **Dedup hides work.** Identical fixtures dedup after first processing; the load
+  harness must fan out unique bytes when measuring real per-job cost.
+
+## 8. Follow-ups (out of scope here)
+
+- SLO **burn-rate alerting** (multi-window) once SLOs stabilize.
+- Continuous profiling (Pyroscope) to attribute CPU *inside* a span.
+- CI smoke load test with a latency budget (feeds Track 8).
diff --git a/docs/enhancements/track-04-handoff.md b/docs/enhancements/track-04-handoff.md
new file mode 100644
index 0000000..74d7391
--- /dev/null
+++ b/docs/enhancements/track-04-handoff.md
@@ -0,0 +1,128 @@
+# Track 4 — Multi-tenancy, auth & quotas — Session Handoff (start here)
+
+**Purpose:** everything a fresh conversation needs to begin **Track 4** without
+prior context. Read top to bottom, then write a short design doc
+(`track-04-multitenancy-auth.md`) before coding, per the per-track philosophy.
+**Fully local — no k8s required** (this is why it was picked ahead of Track 2,
+which needs a cluster). Assumes Tracks 3, 1, 1b done.
+
+---
+
+## 1. What MPiper is (60-second orientation)
+
+Go **API** (`cmd/server`, `internal/`) + Python **worker** (`worker/`) over
+**Redis Streams**; **Postgres** is source of truth; **MinIO** stores objects.
+Full orientation, topology, runbook, env, landmines: reuse
+[`track-01-handoff.md`](track-01-handoff.md) §1, §6, §7.
+
+---
+
+## 2. The goal in one sentence
+
+Turn the single-user, best-effort API into one that safely serves **multiple
+tenants** — real authN/authZ (expiring, rotatable credentials), **tenant
+isolation** on every asset read/write, **idempotency keys** so client retries
+don't duplicate work, and **per-tenant quotas/rate limits**.
+
+---
+
+## 3. Current state & gotchas — verified in code (do these FIRST)
+
+- **Auth is a homegrown AES-256-GCM token with no expiry or rotation.**
+  `pkg/utils/crypt.go` `GenerateToken/DecryptToken` encrypts *just the userID*;
+  `internal/middleware/authorization.go` decrypts it with `config.MustGet().EncryptionKey`.
+  Problems: **no `exp`/issued-at**, no key rotation, opaque (no claims), and the
+  middleware comment says "Invalid or expired token" but **nothing ever expires**.
+- **⚠️ One key encrypts everything.** The same `ENCRYPTION_KEY` (exactly 32 bytes)
+  encrypts **auth tokens AND webhook secrets** (webhook secrets are stored
+  encrypted with it; see `internal/service/webhook.go` + the dispatcher's
+  `DecryptToken`). Leaking it compromises both. **Separate the webhook-signing key
+  from the auth-signing key** early — it touches stored data, so plan a migration.
+- **Tenant tagging exists but is shallow.** `assets.owner_id` was added
+  (`migrations/000004_assets_owner_id.up.sql`) and `internal/service/asset.go`
+  (~L128) sets it from `middleware.GetUserID(ctx)` on create. The webhook→asset
+  join already scopes by it (`JOIN assets a ON a.owner_id = wr.user_id`).
+- **⚠️ But reads/writes are NOT consistently owner-scoped — likely IDOR.** Verify
+  every asset path (`GET /assets/{id}/complete`, any asset fetch, variant lookups)
+  filters by `owner_id` = caller. The repo queries (`internal/repository/asset_repo.go`)
+  fetch by `asset_id` alone in places. **Task: enforce tenant scoping at the
+  repository layer** so a caller can never touch another tenant's asset by ID.
+- **No idempotency keys.** A retried `POST /storage/presign` creates a **duplicate
+  asset** every time (confirmed gap — `docs/arch/reliability-and-correctness.md`
+  §"Idempotency today", gap #7). There is no `Idempotency-Key` handling and no
+  store for replaying prior responses.
+- **Flat tenancy + single bucket.** No org→project hierarchy; one MinIO bucket with
+  path prefixes (`media/raw/<assetId>`). No per-tenant prefix/credentials.
+- **No quotas or per-tenant rate limits.** Any token can submit unbounded work.
+
+---
+
+## 4. Engineering targets (suggested order — highest value / lowest risk first)
+
+1. **Idempotency keys (Stripe-style).** Accept an `Idempotency-Key` header on
+   `presign` (and `complete`); store `(tenant, key) → asset_id/response` with a TTL;
+   on replay within TTL return the **same** asset + response instead of creating a
+   new one. Decide: key storage table + TTL, response replay vs just dedup, scope
+   (per-tenant). *Teaches: the idempotency pattern, retry-safety.*
+2. **Tenant isolation at the repository layer.** Thread tenant id through context
+   → every asset query gets a `WHERE owner_id = $tenant` (or `tenant_id`) guard;
+   add tests that a cross-tenant fetch 404s. Close the IDOR. Add per-tenant storage
+   **prefixes** (`media/<tenant>/raw/...`).
+3. **Real auth.** Either **JWT** (asymmetric keys, `exp`, JWKS rotation) or scoped
+   **API keys** (hashed at rest, revocable). Add expiry + rotation; **split the
+   webhook-signing secret from the auth key** (migration for existing encrypted
+   webhook secrets). Keep the middleware contract (`GetUserID` → now `GetTenant`/claims).
+4. **Quotas + rate limits.** Per-tenant request rate limit (middleware) and usage
+   accounting (e.g. assets/storage per tenant) with enforcement on `presign`.
+   *Teaches: backpressure at the edge, usage metering.*
+
+> Pick a tenancy model up front and document it: minimal is keep `owner_id` =
+> tenant; fuller is `org → project → asset` with row scoping. Don't over-build —
+> the IDOR fix + idempotency are the high-value core.
+
+## 5. Acceptance / how we'll know it worked
+
+- **Idempotency:** same `Idempotency-Key` replayed → one asset, identical response;
+  different key → new asset. Test under concurrent duplicate requests (no race dupes).
+- **Isolation:** tenant A cannot read/complete/lookup tenant B's asset by ID
+  (returns 404/403); storage objects land under the tenant prefix. Add repo-level +
+  HTTP-level tests.
+- **Auth:** expired token rejected; rotated signing key still validates
+  unexpired tokens (JWKS/keyset); webhook secrets decrypt with their *own* key
+  post-migration.
+- **Quotas:** a tenant over its limit is throttled/429'd; usage metric per tenant.
+- No load test needed, but add a security-focused test suite. Optionally write
+  `experiments/0004-tenancy.md` documenting the IDOR-before/after.
+
+## 6. Landmines
+
+- **Key-split migration:** existing `webhook_registrations.secret` rows are
+  encrypted with `ENCRYPTION_KEY`. Splitting keys means re-encrypting them — plan a
+  one-time migration or dual-read window. Don't strand existing registrations.
+- **Don't break the local token-minting path:** `scripts/demo-e2e.sh`,
+  `loadtest/run.sh`, and the README all mint the current AES token inline with a
+  Python snippet. If you change the token format, update all three or provide a
+  compatibility shim, or the demo + load harness break.
+- **Context plumbing:** `middleware.GetUserID(ctx)` is the single chokepoint —
+  extend it to carry tenant/claims rather than scattering token parsing.
+- **Worker side:** the worker also reads `owner_id` (webhook join in
+  `worker/webhooks.py`); a tenancy-column rename ripples into the worker SQL + its
+  tests. Grep both services.
+- **`ENCRYPTION_KEY` is required at boot** (config panics without it, exactly 32
+  bytes) — keep that contract or update config validation + all envs.
+
+## 7. First-session scope
+
+1. **Design doc** `track-04-multitenancy-auth.md`: tenancy model (flat owner vs
+   org/project), auth choice (JWT vs API keys), idempotency-key storage + TTL,
+   key-split migration plan.
+2. **Idempotency keys** on `presign`/`complete` (highest value, self-contained).
+   *Demo:* replayed key → one asset.
+3. **Repository-layer tenant scoping** + the IDOR test. *Demo:* cross-tenant fetch 404s.
+4. **Auth hardening** (expiry + rotation + key split) and **quotas** as follow-ups.
+
+## 8. Key reads
+
+- [`track-01-handoff.md`](track-01-handoff.md) — topology/runbook/env (reuse).
+- `docs/arch/reliability-and-correctness.md` — §"Idempotency today" + the gap table (gap #7 client idempotency keys; replay protection).
+- `pkg/utils/crypt.go` (token gen/decrypt), `internal/middleware/authorization.go` (`GetUserID` chokepoint), `internal/service/asset.go` (~L128 owner_id set; presign/complete flow), `internal/repository/asset_repo.go` (asset queries to scope), `internal/service/webhook.go` + `internal/webhook/dispatcher.go` (shared-key webhook secrets), `migrations/000004_assets_owner_id.*`.
diff --git a/docs/enhancements/track-04-multitenancy-auth.md b/docs/enhancements/track-04-multitenancy-auth.md
new file mode 100644
index 0000000..19d6148
--- /dev/null
+++ b/docs/enhancements/track-04-multitenancy-auth.md
@@ -0,0 +1,161 @@
+# Track 4 — Multi-tenancy, Auth & Quotas — Design
+
+**Status:** accepted · **Scope:** fully local (no k8s) · **Predecessor:** see
+[`track-04-handoff.md`](track-04-handoff.md).
+
+This document records the confirmed design decisions for Track 4 before
+implementation. It turns the single-user, best-effort API into one that safely
+serves multiple tenants: real authN/authZ, tenant isolation on every asset
+operation, idempotency keys, and per-tenant quotas/rate limits.
+
+---
+
+## 1. Tenancy model — flat `owner_id`
+
+**Decision:** flat tenancy. `assets.owner_id` (TEXT) **is** the tenant
+identifier. No `org → project → asset` hierarchy.
+
+- The column **keeps its name** (`owner_id`). Renaming it to `tenant_id` would
+  ripple into the worker (`worker/webhooks.py` and its test assert the exact SQL
+  `JOIN assets a ON a.owner_id = wr.user_id`). Keeping the name means **zero
+  worker churn**. "tenant" is only the in-process *concept*; on disk it stays
+  `owner_id` / `webhook_registrations.user_id`.
+- The tenant identifier is a free-form TEXT string sourced from the API key (see
+  §2). There is no separate `users`/`tenants` table — the `api_keys` table is the
+  identity source of record.
+
+**Rejected:** `org → project` hierarchy — more schema + bigger worker/webhook
+ripple for no near-term value. The high-value core is the IDOR fix +
+idempotency, not a richer tenancy graph.
+
+## 2. Auth — scoped API keys (drop the AES token)
+
+**Decision:** replace the homegrown AES-256-GCM token with **scoped API keys**.
+
+- **Wire format:** `mp_<prefix>_<secret>` where `prefix` is a short public
+  identifier (used to narrow lookups and shown in listings) and `secret` is
+  high-entropy random.
+- **At rest:** only the **SHA-256 hash** of the full key is stored
+  (`api_keys.key_hash`, UNIQUE). API keys are high-entropy, so a fast hash with
+  an indexed equality lookup is appropriate (bcrypt targets low-entropy passwords,
+  and its per-hash salt rules out an indexed equality lookup). The plaintext key
+  is shown **once** at mint time and never persisted.
+- **Lifecycle:** keys carry optional `expires_at` and `revoked_at`. The auth
+  middleware rejects missing/unknown/expired/revoked keys with `401`.
+- **Scopes:** `scopes JSONB` is carried for future authorization granularity;
+  initially keys are minted with a broad scope.
+- **Context contract:** the single chokepoint `middleware.GetUserID` is renamed
+  to `middleware.GetTenant`, returning the tenant id (and scopes) from the
+  validated key. All call sites move to `GetTenant`.
+- **No HTTP admin surface.** Keys are minted out-of-band via a CLI
+  (`cmd/mint-api-key`, wrapped by `scripts/mint-api-key.sh`) that inserts a row
+  and prints the plaintext once.
+
+The old AES auth path (`utils.GenerateToken`/`DecryptToken` for auth) is
+**removed**. The demo/loadtest/test-webhooks scripts and the README are cut over
+to mint and use an API key.
+
+### `api_keys` schema
+
+| column       | type          | notes                                  |
+|--------------|---------------|----------------------------------------|
+| `id`         | UUID PK       | `gen_random_uuid()` / `uuid_generate_v4()` |
+| `tenant_id`  | TEXT NOT NULL | the tenant identifier (== `owner_id`)  |
+| `key_hash`   | TEXT UNIQUE NOT NULL | SHA-256 hex of full `mp_..._...` key |
+| `prefix`     | TEXT NOT NULL | public, indexed; narrows lookup        |
+| `scopes`     | JSONB NOT NULL DEFAULT `'[]'` | reserved for authz     |
+| `expires_at` | TIMESTAMPTZ   | NULL = never expires                   |
+| `revoked_at` | TIMESTAMPTZ   | NULL = active                          |
+| `created_at` | TIMESTAMPTZ NOT NULL DEFAULT now() |                   |
+
+Indexes on `prefix` and `key_hash`.
+
+## 3. Key split — separate webhook-signing key from auth key
+
+**Decision:** introduce `WEBHOOK_ENCRYPTION_KEY` (32 bytes). Webhook secrets are
+encrypted/decrypted with it (`service/webhook.go` + `webhook/dispatcher.go`)
+instead of the shared `ENCRYPTION_KEY`. After the API-key cutover, `ENCRYPTION_KEY`
+is no longer used for auth; the webhook key fully owns webhook-secret encryption,
+so a leak of one no longer compromises the other.
+
+**Migration:** because the project is **pre-launch and local**, existing
+`webhook_registrations` rows are **truncated** rather than re-encrypted — no
+dual-read window or one-time re-encrypt pass is needed.
+
+## 4. Tenant isolation — repository-layer scoping (close the IDOR)
+
+**Decision:** enforce `WHERE owner_id = $tenant` at the repository layer so a
+caller can never touch another tenant's asset by ID.
+
+- The verified IDOR surface today is the **`complete` write path**:
+  `MarkAssetUploadedTx` updates by `asset_id` alone. The owner guard is added:
+  `WHERE asset_id = $1 AND owner_id = $tenant AND status = 'uploading'`.
+- The service maps "0 rows / not owned" to a **404** (indistinguishable from a
+  non-existent asset — no cross-tenant existence leak).
+- The tenant id is threaded from context (`GetTenant`) into the asset
+  service/repo calls.
+
+**Migration:** delete existing `assets` (and dependent rows) — pre-launch local
+data — then `ALTER COLUMN owner_id SET NOT NULL` and add index `idx_assets_owner`.
+
+### Deferred: per-tenant storage prefix
+
+Per-tenant object prefixes (`media/<tenant>/raw/...`) are **out of scope** for
+this track. The worker reconstructs `media/raw/<assetId>`
+(`worker/processing/processor.py`) and its processed-output keys from `asset_id`
+without selecting `owner_id`; prefixing would break the worker's download/upload
+paths and require threading tenant through the worker SQL + key construction +
+tests — exactly the worker churn the flat-tenancy decision avoids. Asset IDs are
+UUIDs, access is gated by presigned URLs, and the IDOR is closed at the DB layer,
+so per-tenant prefixing is defense-in-depth, deferrable to a later track.
+
+## 5. Idempotency keys — full response replay, 24h TTL
+
+**Decision:** Stripe-style idempotency on `presign` (and `complete` when the
+header is present), scoped per-tenant, with **full response replay**.
+
+- **Header:** `Idempotency-Key`. Absent → behave exactly as today (no-op).
+- **Storage:** `idempotency_keys` table, PK `(tenant_id, key)`:
+
+  | column                | type         | notes                                |
+  |-----------------------|--------------|--------------------------------------|
+  | `tenant_id`           | TEXT         | part of PK                           |
+  | `key`                 | TEXT         | part of PK                           |
+  | `request_fingerprint` | TEXT         | hash of method+path+body             |
+  | `status`              | TEXT         | `pending` / `done`                   |
+  | `response_status`     | INT          | replayed HTTP status                 |
+  | `response_body`       | JSONB        | replayed body                        |
+  | `asset_id`            | UUID         | created asset (nullable)             |
+  | `created_at`          | TIMESTAMPTZ  |                                      |
+  | `expires_at`          | TIMESTAMPTZ  | `created_at + 24h`                   |
+
+- **Concurrency:** the first request inserts a `pending` row; the PK unique
+  constraint is the lock. Concurrent duplicates that collide on the `pending`
+  row get `409` (in-flight). Once `done`, replays within TTL return the stored
+  response **verbatim**.
+- **Reuse with a different payload:** same `(tenant, key)` but a different
+  `request_fingerprint` → `422` (key reused for a different request).
+- **TTL:** 24h. A background sweep deletes expired rows.
+
+## 6. Quotas + rate limits — per-tenant
+
+**Decision:** per-tenant token-bucket rate limiting (keyed by `tenant_id`,
+replacing/extending the existing per-IP `presignRateLimiter`) returning `429`
+with `Retry-After`, plus usage accounting (assets and/or bytes per tenant)
+checked on `presign` with over-quota rejection. Limits are config-driven with
+sane defaults, and a per-tenant usage metric is exposed.
+
+## 7. Sequencing
+
+Auth (Tasks 1–3) lands first because idempotency and tenant scoping both depend
+on a real tenant in context, and no identity table existed before. Then tenant
+scoping (Task 4) closes the IDOR, and idempotency (Task 5) + quotas (Task 6)
+layer on top. Each task is independently demoable.
+
+## 8. Compatibility / landmines
+
+- `ENCRYPTION_KEY` remains a required 32-byte boot config (webhook key split
+  adds `WEBHOOK_ENCRYPTION_KEY` alongside it).
+- Three scripts + README mint the old AES token inline; all are cut over to API
+  keys in Task 2/3.
+- `owner_id` column name is preserved → worker untouched.
diff --git a/experiments/0001-worker-saturation.md b/experiments/0001-worker-saturation.md
new file mode 100644
index 0000000..8214ee6
--- /dev/null
+++ b/experiments/0001-worker-saturation.md
@@ -0,0 +1,117 @@
+# Experiment 0001 — Worker saturation under load
+
+**Date:** 2026-06-30 · **Track:** 3 (observability & load) · **Feeds:** Track 1 (concurrent worker)
+**Status:** complete
+
+## Hypothesis
+
+The Python worker is single-threaded (`consume()` reads one message, `count=1`,
+and processes it inline; `MAX_CONCURRENT_JOBS` exists in config but is unused).
+Under sustained upload load the worker — not the API — should be the bottleneck,
+with the Redis stream growing without bound once arrival rate exceeds the
+worker's service rate (Little's Law, `L = λW`).
+
+## Setup (record this with every run)
+
+- **Resource pinning** (`docker-compose.loadtest.yml`): `api` = 1.0 CPU / 512 MB,
+  `worker` = **1.0 CPU** / 1 GB. The single-CPU pin makes the bottleneck a stable,
+  observable fact rather than something that moves with spare laptop cores.
+- **Sampling:** `TRACE_SAMPLING_RATE=1.0` (every asset traced).
+- **Stack:** core + observability + loadtest overlays, all up.
+- **Workload:** images only, unique bytes per iteration (dedup defeated — see
+  `loadtest/lib.js`). Fixture `worker/tests/test_assets/image.jpg`, 3 webp
+  variants per asset.
+- **Profile:** open model, `./loadtest/run.sh open --rate 10/s --duration 60s`
+  (fixed arrival rate; λ = 10 uploads/s).
+
+> Local results are **relative**. Trust the bottleneck location and the
+> before/after deltas, not the absolute throughput — laptop CPU, no network
+> latency, single-node Redis/Postgres.
+
+## Method (the loop)
+
+(1) Is an SLO breached? (2) USE — is the worker CPU- or queue-saturated?
+(3) Open an exemplar trace — which span dominates? (4) Form a hypothesis, change
+one thing, re-run the **same** profile, compare.
+
+## Results (before — no optimisation yet)
+
+| Signal | Value | Source |
+|--------|-------|--------|
+| Arrival rate (λ) | 10.0 uploads/s | k6 `mpiper_assets_submitted` |
+| Worker service rate (μ) | **1.13 jobs/s** | `rate(mpiper_mpiper_job_processing_success_total[2m])` |
+| Mean asset processing time | 0.81 s | `…_duration_seconds_sum / …_count` |
+| Queue depth before → after | 3985 → 4370 (↑) | `sli:queue_depth:current` |
+| Worker CPU | **98.5 %** (pinned at 1 CPU) | `docker stats` |
+| API CPU | 0.4 % | `docker stats` |
+| Presign p95 (API) | 48 ms (SLO < 150 ms ✅) | `sli:presign_latency_seconds:p95` |
+| Job success rate | 1.0 (SLO > 99 % ✅) | `sli:job_success_ratio:ratio_rate5m` |
+
+**Reading:** λ (10/s) ≫ μ (1.13/s). The queue grows monotonically; the system is
+**unstable for any arrival rate above ≈ 1.1 uploads/s**. The API is essentially
+idle (0.4 % CPU, presign well inside SLO) while the worker is pinned at 98.5 %.
+The bottleneck is unambiguously the worker, and specifically its
+**single-threaded, one-job-at-a-time** processing loop — not CPU work that is
+inherently slow (a single image is ~0.8 s), but the complete absence of
+concurrency.
+
+## Trace evidence (where the time goes)
+
+With the trace gap now closed (Track 3, Phase 1), one asset is a single trace
+from the API through the queue into the worker — example, 19 spans:
+
+```
+/api/v1/assets/{id}/complete                 (API HTTP request)
+└ AssetHandler.MarkAssetUploaded
+  └ AssetService.MarkAssetUploaded
+    ├ StorageClient.GetObjectAttrs → S3.GetObjectAttrs
+    └ Database.Transaction
+      ├ AssetRepo.MarkAssetUploadedTx
+      ├ AssetRepo.InsertProcessAssetJobTx
+      └ OutboxRepo.InsertTx
+        └ outbox.publish                       (relay re-activates stored context)
+          └ RedisQueue.Enqueue                 (injects traceparent into the message)
+            ├ RedisQueue.doXAddWithRetry
+            └ worker.consume                   (── crosses the Redis boundary ──)
+              └ process.dispatch
+                ├ process.download
+                ├ process.dedup_check
+                └ image.variant × 3
+```
+
+The **gap between `RedisQueue.Enqueue` and `worker.consume`** is the queue wait —
+the time an asset spends backed up behind the single worker. Under this profile
+that gap dominates end-to-end latency, and it grows for every asset because the
+backlog only ever increases. The in-worker stages (download, dedup, 3 variants)
+are individually fast; the cost is waiting for a free worker, not the work itself.
+
+## Conclusion
+
+The single-threaded worker is the bottleneck, with a service rate of ~1.1
+jobs/s. The pipeline cannot keep up with anything beyond a trickle of uploads,
+and the deficit manifests as an unbounded Redis backlog and ever-growing
+queue-wait latency — while the API and host CPU sit idle. This is the motivating
+evidence for **Track 1 (concurrent worker + stream recovery + DLQ)**: honour
+`MAX_CONCURRENT_JOBS` as a real concurrency limit (process pool for the
+CPU-bound Pillow/ffmpeg work) so μ scales with available cores instead of being
+fixed at one.
+
+## Reproduce
+
+```bash
+docker compose -f docker-compose.yml -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml up -d --build
+./loadtest/run.sh open --rate 10/s --duration 60s
+# Grafana http://localhost:3000 → MPiper folder:
+#   - "Worker / App Saturation (USE)": queue depth climbing, in-flight pinned
+#   - "Pipeline Funnel": ready/s flat at ~1.1 while uploaded/s tracks arrival
+# Tempo (Explore): TraceQL `{ name="worker.consume" }` → open one → see the
+#   enqueue→consume queue-wait gap and the per-stage breakdown.
+```
+
+## Next experiment
+
+After Track 1 lands a bounded worker pool, re-run this **exact** profile and
+compare: μ should rise roughly with the pool size (until CPU-bound), queue depth
+should stabilise instead of growing, and the enqueue→consume gap should shrink.
+Record results as `0002-concurrent-worker.md`.
diff --git a/experiments/0002-concurrent-worker.md b/experiments/0002-concurrent-worker.md
new file mode 100644
index 0000000..2ecceda
--- /dev/null
+++ b/experiments/0002-concurrent-worker.md
@@ -0,0 +1,200 @@
+# Experiment 0002 — Concurrent worker
+
+**Date:** 2026-06-30 · **Track:** 1 (concurrent worker + recovery + DLQ) · **Follows:** 0001
+**Status:** implementation complete; **after-load numbers measured 2026-06-30** (see *Results (after)*).
+
+## Hypothesis
+
+0001 proved the worker is the bottleneck: a single-threaded loop with a service
+rate μ ≈ 1.1 jobs/s while the API sits idle and the Redis backlog grows without
+bound. `MAX_CONCURRENT_JOBS` existed in config but was never used.
+
+Honouring `MAX_CONCURRENT_JOBS` as a real concurrency limit (a bounded worker
+pool) should raise μ roughly **N×** (until the worker becomes CPU-bound on its
+allotted cores), at which point the queue **stabilises/drains** for any arrival
+rate λ ≤ μ instead of growing. Per-job latency is unchanged — the win is
+throughput. Job success must stay 100 % (no double-processing).
+
+## What changed (the implementation under test)
+
+- **Bounded thread pool.** The consumer now owns a
+  `ThreadPoolExecutor(max_workers=MAX_CONCURRENT_JOBS)`. `consume()` reads only up
+  to the current **free capacity** (`MAX_CONCURRENT_JOBS − in-flight`) and submits
+  each message to the pool; at capacity it returns immediately (no blocking read
+  held open while full). (`worker/consumer/consumer.py`, `main.py`)
+  - **Why threads, not processes:** per-job work is I/O + subprocess heavy —
+    object-store download/upload (releases GIL), ffmpeg via `subprocess` (true
+    parallelism), Pillow (releases GIL for most ops), psycopg (I/O). Threads give
+    real concurrency here while sharing one thread-safe `psycopg_pool` and one set
+    of (thread-safe) OTel instruments. A process pool would force per-process
+    DB/Redis pools, pickling the storage client, and per-process OTel init.
+    **GIL escalation path** (documented in the module): if profiling later shows
+    GIL-bound Python sections dominate, move only the transform stage to a
+    `ProcessPoolExecutor` (hybrid), not the whole consumer.
+- **Invariants preserved.** Per-`msg_id` ack (each task acks only its own message
+  on success; failures stay in the PEL); the `SELECT … FOR UPDATE` job claim and
+  `status == 'done'` short-circuit are untouched; `_handle_job` still owns the
+  asset `failed`/`ready` transition (DEV-34); each task starts its **own**
+  `worker.consume` span with that message's extracted `traceparent` (no shared
+  spans); per-task metrics (`record_consume`/`record_job`/`record_asset`), no
+  `asset_id` on any metric label.
+- **Bounded shutdown drain.** On SIGTERM the loop stops reading and
+  `consumer.shutdown(timeout=SHUTDOWN_DRAIN_TIMEOUT, default 30 s)` waits for
+  in-flight jobs, then stops. Anything still running is abandoned and safely
+  reclaimed by recovery (below). Keep the timeout ≤ the container
+  `stop_grace_period`.
+- **DB pool scales with concurrency.** `PgPool` is now sized
+  `MAX_CONCURRENT_JOBS + 2`; each in-flight job holds at most one connection, so
+  the pool no longer silently caps concurrency. (`worker/consumer/db.py`, `main.py`)
+- **XAUTOCLAIM recovery.** The old DB-scan + `XADD` requeue is replaced by
+  `XAUTOCLAIM` on `media:jobs` / `worker-group`: messages idle past
+  `RECOVERY_MIN_IDLE_MS` (default 120 000) are reclaimed from dead consumers and
+  re-dispatched through the same bounded pool, capped at free capacity.
+- **Dead-letter stream.** Permanent failures (non-retryable, or attempts ≥
+  `max_retries`) `XADD` to `media:jobs:dlq` with failure metadata and `XACK` the
+  original (previously left unacked and reclaimed forever). A message reclaimed
+  more times than `max_retries` is also dead-lettered. DLQ depth is exposed as the
+  `mpiper.dlq.depth` observable gauge with a panel on **Queue Health**.
+
+## Setup (record this with every run)
+
+- **A/B via env knobs on `docker-compose.loadtest.yml`** (no new overlays, same
+  binary): `WORKER_CPUS=4` on **both** sides (give the pool real cores), vary
+  `MAX_CONCURRENT_JOBS` (1 = serial baseline → 4/8 = concurrent).
+  `TRACE_SAMPLING_RATE=1.0`. API = 1.0 CPU / 512 MB.
+- **Stack:** core + observability + loadtest + webhooks overlays.
+- **Workload:** images, unique bytes per iteration (dedup defeated). 3 webp
+  variants per asset.
+- **Measurement:** `./loadtest/run.sh closed --vus 20 --duration 2m` to apply a
+  saturating load, then μ = Δ`jobs.status='done'` over 30 s while draining (clean
+  steady-state, free of restart-ramp and API contention). `./loadtest/run.sh
+  capture "<label>"` + `docker stats` for the supporting signals.
+
+> **Why not the 1-CPU pin from 0001:** at 1 CPU, thread concurrency overlaps I/O
+> waits but cannot exceed one core of CPU work, so even a perfect fix looks flat.
+> This A/B uses `WORKER_CPUS=4` on both sides and the `closed` (max-throughput)
+> model so the μ scaling is actually observable. Record the CPU/mem limits with
+> every run — they set the ceiling.
+
+## Method (the loop)
+
+1. Run the A/B profile described in *Setup* (deliberately not the 1-CPU pin from
+   0001).
+2. USE: is the worker now using all allotted cores / are slots saturated rather
+   than idle?
+3. Is queue depth stabilising instead of growing?
+4. Open a Tempo trace — multiple `worker.consume` spans should overlap in time.
+5. Verify job success = 100 % via DB job counts + dedup.
+
+## Results (before — single-threaded, from 0001)
+
+| Signal | Baseline | Source |
+|--------|----------|--------|
+| Worker service rate μ | **~1.13 jobs/s** | `rate(mpiper_mpiper_job_processing_success_total[2m])` |
+| Worker CPU | 98.5 % (1 core, pegged) | `docker stats` |
+| Queue depth | 3985 → 4370 (↑, unbounded) | `sli:queue_depth:current` |
+| Mean asset processing time | 0.81 s | `…_duration_seconds_sum / …_count` |
+| Job success rate | 1.0 | `sli:job_success_ratio:ratio_rate5m` |
+
+## Results (after — bounded pool) — MEASURED 2026-06-30
+
+Controlled A/B on the **same binary**, varying only the concurrency knob at a
+fixed core budget (`WORKER_CPUS=4`). μ measured as **steady-state jobs completed
+per second** while draining a backlog (counting `jobs.status='done'` over 30 s) —
+this avoids the rate-window contamination from worker restarts and isolates pure
+worker throughput from API contention.
+
+| Config | μ (jobs/s) | Worker CPU | vs serial | Notes |
+|--------|-----------:|-----------:|----------:|-------|
+| **BEFORE** `MAX_CONCURRENT_JOBS=1` | **0.73** | ~92 % (**1 core**) | 1.0× | serial baseline |
+| `MAX_CONCURRENT_JOBS=8` | 1.33 | ~406 % (**4 cores**) | 1.8× | oversubscribed; MEM pegged at 1 GB cap |
+| `MAX_CONCURRENT_JOBS=8`, 4 GB | 1.33 | ~406 % | 1.8× | memory wasn't the limit (used ~1 GB) |
+| **AFTER** `MAX_CONCURRENT_JOBS=4` | **1.73** | ~319 % (**~3.2 cores**) | **2.37×** | sweet spot at this core budget |
+
+**Reading:** concurrency unambiguously works — the worker went from pegging a
+**single core (92 %)** to using **3–4 cores**, and steady-state throughput rose
+**2.37×** (0.73 → 1.73 jobs/s). DB connection waits stayed **0** (pool sizing
+holds), and job success stayed 100 % (no double-processing).
+
+Two findings the load test surfaced that the unit tests could not:
+
+1. **Tune `MAX_CONCURRENT_JOBS` near the core count, not arbitrarily high.**
+   `mcj=8` on a 4-core budget *oversubscribed* — 8 Python threads contending for
+   4 cores pushed per-job CPU cost up (~3.0 core-s/job vs ~1.3 serial) and
+   throughput *down* vs `mcj=4` (1.33 vs 1.73 jobs/s). This is the documented GIL
+   tradeoff made concrete: image work is partly GIL-bound, so beyond ~1 thread
+   per core the contention overhead outweighs the gain.
+2. **Memory headroom matters.** At `mcj=8` the worker pegged the 1 GB cap
+   (`MEM=1023/1024 MiB`); raising to 4 GB removed the cap pressure (used ~1 GB).
+   Size worker memory to the pool, not the single-threaded baseline.
+
+The 2.37× (not 4×) gain reflects that per-job image work is only partly
+parallelisable under the GIL — exactly the escalation signal noted in the module
+docstring: if higher scaling is needed, move the transform stage to a process
+pool. For the current workload, `mcj ≈ cores` with adequate memory is the win.
+
+## Trace evidence — confirmed
+
+Worker CPU jumping from ~92 % (one core) to ~320–406 % (multiple cores) under the
+same load confirms multiple `worker.consume` tasks executing in parallel. Each
+task starts its own span with its message's `traceparent` (verified by
+`test_consumer_tracing.py` under async dispatch).
+
+## Recovery & DLQ demos — DLQ confirmed live
+
+- **DLQ (confirmed):** during the runs a permanently-failing job (a stale job
+  whose asset row had been removed → non-retryable FK violation) was routed to
+  `media:jobs:dlq` and acked. `XLEN media:jobs:dlq` = 1, and `XRANGE` shows the
+  full metadata:
+  `{job_id, asset_id, error="…image_asset_id_fkey…", attempts, original_msg_id,
+  failed_at}`. The old behaviour would have left it unacked and reclaimed it
+  forever; it is now parked for inspection/replay and visible on the Queue Health
+  DLQ-depth panel.
+- **Reclaim:** covered by `test_consumer_recovery.py` (`XAUTOCLAIM` reclaim +
+  redispatch). A live demo needs a consumer killed mid-job and a wait past
+  `RECOVERY_MIN_IDLE_MS` (120 s); not run here.
+
+## Conclusion
+
+The bounded worker pool is a clear, measured win: **2.37× steady-state
+throughput** and **multi-core utilisation** (1 → ~3.2 cores) on the same binary
+under the same load, with 0 DB pool waits and 100 % job success. The DLQ works in
+production conditions (a real poison message landed with metadata). Two
+operational lessons: set `MAX_CONCURRENT_JOBS ≈ cores` (oversubscription hurt at
+`mcj=8`) and give the worker memory headroom proportional to the pool.
+
+## Reproduce
+
+```bash
+docker compose -f docker-compose.yml -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml up -d --build
+./loadtest/run.sh open --rate 10/s --duration 90s
+
+# Worker throughput + saturation:
+#   Grafana → MPiper → "Worker / App Saturation (USE)" and "Pipeline Funnel"
+# Job success ground truth:
+docker exec mpiper-postgres psql -U mpiper -d mpiper -c \
+  "SELECT status, count(*) FROM jobs GROUP BY status;"
+# Redis stream / recovery / DLQ:
+docker exec mpiper-redis redis-cli XINFO GROUPS media:jobs
+docker exec mpiper-redis redis-cli XPENDING media:jobs worker-group
+docker exec mpiper-redis redis-cli XLEN media:jobs:dlq
+```
+
+> **Histogram-bucket caveat:** the worker duration histograms and the API
+> `db.query.duration` view changed bucket boundaries in this work. When reading
+> p95 across a window that spans the deploy, reset Prometheus data or wait for the
+> old series to age out before trusting `histogram_quantile`.
+
+## Tests backing this change
+
+- `worker/tests/test_consumer_pool.py` — free-capacity read cap + in-flight cap;
+  failed task leaves message unacked; malformed message acked by `msg_id`; bounded
+  drain waits for in-flight.
+- `worker/tests/test_consumer_recovery.py` — `XAUTOCLAIM` reclaim + redispatch
+  (asserts `min_idle_time`/`consumername`/`count`), skip when no free capacity, ack
+  tombstoned entries; periodic-recovery cadence preserved.
+- `worker/tests/test_consumer_retry.py` — permanent failure → DLQ (`XADD`+`XACK`);
+  retryable failure → left unacked (no DLQ).
+- `worker/tests/test_consumer_tracing.py` — per-task `worker.consume` span still
+  continues the producer trace under async dispatch.
+- `worker/tests/test_db_pool.py` — `PgPool` honours the configured `max_size`.
+- All 31 worker unit tests pass in-container
+  (`docker run --rm --entrypoint python … -m unittest discover -s worker/tests`).
diff --git a/experiments/0003-webhook-throughput.md b/experiments/0003-webhook-throughput.md
new file mode 100644
index 0000000..2809770
--- /dev/null
+++ b/experiments/0003-webhook-throughput.md
@@ -0,0 +1,159 @@
+# Experiment 0003 — Webhook delivery throughput
+
+**Date:** 2026-06-30 · **Track:** 1b (webhook throughput) · **Follows:** 0001
+**Status:** implementation complete; **after-load numbers measured 2026-06-30** (see *Results (after)*).
+
+## Hypothesis
+
+The webhook dispatcher delivers serially: `tick()` claims a batch with
+`FOR UPDATE … SKIP LOCKED LIMIT BatchSize` and then loops
+`for _, row := range rows { d.deliver(ctx, row) }`, where each `deliver()` is a
+synchronous HTTP POST bounded by `WEBHOOK_TIMEOUT` (10 s). With a 2 s poll and a
+batch of 50, the *best-case* drain rate is `BatchSize / PollInterval` only if
+each POST is instant; in practice one slow receiver stalls the whole batch.
+Under the 0001 load profile (each asset emits `job.starting → job.started →
+job.done`), `webhook_pending` grows without bound and never drains.
+
+Delivering the batch **concurrently** (bounded pool of `WEBHOOK_CONCURRENCY`)
+should make the drain rate scale with the pool size until the receiver or the DB
+becomes the limit, so `webhook_pending` returns to ~0.
+
+## What changed (the implementation under test)
+
+- **Concurrent delivery.** `tick()` now fans the claimed batch out across an
+  `errgroup.Group` with `SetLimit(WEBHOOK_CONCURRENCY)` — one goroutine per
+  delivery, at most `WEBHOOK_CONCURRENCY` in flight. `handleFailure`/`backoff`/
+  `markFailed` are unchanged and keyed by the row's own id, so concurrent
+  delivery is race-free. (`internal/webhook/dispatcher.go`)
+- **HTTP transport tuning.** The dispatcher's `http.Client` now uses a custom
+  `http.Transport` with `MaxIdleConnsPerHost = MaxConnsPerHost = WEBHOOK_CONCURRENCY`.
+  Go's default `MaxIdleConnsPerHost` is 2, which would serialize TLS handshakes
+  for N concurrent POSTs to one receiver and inflate p95 — the tuning lets
+  concurrent deliveries to the same host reuse connections.
+- **Delivery metrics wired.** `WebhookDeliveryTotal`, `WebhookDeliveryDuration`,
+  and `WebhookDeliveryFailures` are now recorded per delivery (labels: `event`,
+  `status` ∈ {delivered, failed, error}; never `asset_id`). `NewDispatcher` takes
+  `*metrics.Metrics`, passed from `cmd/server/main.go`. The
+  `sli:webhook_delivery_latency_seconds:p95` recording rule already existed and
+  now has a histogram to read.
+- **Config.** `WEBHOOK_CONCURRENCY` (default **10**) added to `WebhookConfig` /
+  `internal/config/env.go`.
+- **Concurrency safety note (documented in code):** the `SKIP LOCKED` claim runs
+  outside an explicit transaction, so locks release when the SELECT returns. That
+  is safe for a single dispatcher fanning out to internal goroutines (each row is
+  claimed once), but NOT for >1 dispatcher process. Scaling past one dispatcher
+  requires wrapping the claim in a tx or adding a `claimed_at`/`locked_by` column.
+
+## Setup (record this with every run)
+
+- **Resource pinning** (`docker-compose.loadtest.yml`): `api` = 1.0 CPU / 512 MB,
+  `worker` = 1.0 CPU / 1 GB. `TRACE_SAMPLING_RATE=1.0`.
+- **Stack:** core + observability + webhooks overlays + loadtest pins, all up.
+- **Webhook receiver:** the bundled receiver (see `docker-compose.webhooks.yml`),
+  reachable from the API container; one registration subscribed to all four
+  `job.*` events.
+- **Profile:** open model, `./loadtest/run.sh open --rate 10/s --duration 90s`
+  (same arrival rate as 0001). Each asset produces 3 webhook deliveries.
+
+> Local results are **relative** — trust the `webhook_pending` drain and the
+> before/after delta, not absolute throughput.
+
+## Method (the loop)
+
+1. Confirm `webhook_pending` is climbing under load (the bottleneck).
+2. Read `sli:webhook_delivery_latency_seconds:p95` and the delivery-rate panel.
+3. Switch concurrency on (this change), re-run the **same** profile, compare the
+   `webhook_pending` trajectory and the delivery rate.
+
+## Results (before — serial dispatcher, from 0001-era observation)
+
+| Signal | Value | Source |
+|--------|-------|--------|
+| `webhook_pending` peak | **~5,901, never drains** | `sli:webhook_pending:current` |
+| Delivery rate | bounded by serial POSTs | `rate(mpiper_webhook_delivery_total[5m])` *(was unrecorded before this change)* |
+| Delivery p95 | unreadable (histogram unrecorded) | `sli:webhook_delivery_latency_seconds:p95` |
+
+## Results (after — concurrent dispatcher) — MEASURED 2026-06-30
+
+A/B on the same binary, `WEBHOOK_CONCURRENCY=1` vs `10`, under closed-model load
+(20 and 40 VUs) with the bundled `http-https-echo` receiver and one registration
+subscribed to all four `job.*` events.
+
+| Config | `webhook_pending` under load | Reading |
+|--------|------------------------------|---------|
+| `WEBHOOK_CONCURRENCY=1` (serial) | stayed **~0–7** | dispatcher kept up |
+| `WEBHOOK_CONCURRENCY=10` | **~0** | dispatcher kept up |
+
+**Honest reading: webhook delivery was *not* the bottleneck at reproducible local
+scale.** With a fast local receiver and the API pinned to 1 CPU, the achievable
+event-generation rate (`job.starting` at the upload rate + worker `job.started`/
+`job.done`) stayed under the *serial* dispatcher's ceiling (≈ `BatchSize/PollInterval`
+= 50/2 s ≈ 25/s), so even `WEBHOOK_CONCURRENCY=1` drained `webhook_pending` to ~0.
+The dramatic 0001 backlog (~5,901) was not reproduced here — it requires either a
+**slow/realistic receiver** (real endpoints have 50–500 ms latency) or a
+generation rate above the serial ceiling, neither of which this local rig
+produces once the worker (not the dispatcher) is the throughput limiter.
+
+What *was* delivered and verified:
+- **Metrics now wired** — `webhook.delivery.total/duration/failures` record per
+  delivery (labels `event`,`status`), so the dispatcher is now observable at all
+  (previously a blind spot). The `sli:webhook_delivery_latency_seconds:p95` rule
+  finally has a histogram.
+- **Concurrency proven** by `TestDispatcher_DeliversConcurrently` (integration):
+  20 deliveries at concurrency 5 run with max-in-flight ∈ [2,5] and all complete —
+  i.e. the headroom is real and kicks in precisely when a slow receiver or a burst
+  pushes generation past the serial ceiling.
+- **Transport tuning** (`MaxIdleConnsPerHost=WEBHOOK_CONCURRENCY`) removes the
+  default 2-connection cap that would otherwise serialise TLS to one host.
+
+> **Follow-up to make the contrast visible:** add artificial receiver latency
+> (e.g. a 200 ms sleep in the echo handler). At 200 ms/POST the serial ceiling
+> drops to ~5/s and `webhook_pending` backs up under load, where `WEBHOOK_CONCURRENCY=10`
+> drains it ~10×. That is the scenario this change is for; the instant local
+> receiver hides it.
+
+## Conclusion
+
+At local scale with an instant receiver, the **worker** is the binding
+constraint and the webhook dispatcher keeps up serially — so this change is
+**insurance + observability** rather than a measured throughput win *here*: the
+delivery metrics are now recorded (it was previously unmonitored), and the
+bounded-concurrency fan-out (proven by the integration test) provides the
+headroom that matters the moment a real, latency-bearing receiver or a burst
+exceeds the serial ceiling. The honest result is "no regression, now observable,
+with headroom" — not the 0001-style drain, which this rig can't reproduce without
+a slow receiver.
+
+## Reproduce
+
+```bash
+docker compose -f docker-compose.yml -f docker-compose.observability.yml \
+  -f docker-compose.webhooks.yml -f docker-compose.loadtest.yml up -d --build
+# (register a webhook subscribed to job.* against the bundled receiver)
+./loadtest/run.sh open --rate 10/s --duration 90s
+
+# Backlog drain (DB-side, ground truth):
+docker exec mpiper-postgres psql -U mpiper -d mpiper -c \
+  "SELECT status, count(*) FROM webhook_deliveries GROUP BY status;"
+
+# Grafana http://localhost:3000 → MPiper → "Queue Health":
+#   - "Webhook delivery p95 (SLO < 10s) + pending": pending should fall to ~0
+# Prometheus :9090:
+#   sum(rate(mpiper_webhook_delivery_total[5m]))           # delivery throughput
+#   histogram_quantile(0.95, sum by (le) (rate(mpiper_webhook_delivery_duration_seconds_bucket[5m])))
+```
+
+> **Histogram-bucket caveat:** the delivery histogram is newly recorded. When
+> reading p95 over a window that spans the deploy, either reset Prometheus data
+> or wait for the pre-change (empty) series to age out, so mixed/empty buckets
+> don't distort `histogram_quantile`.
+
+## Tests backing this change
+
+- `internal/webhook/dispatcher_test.go` — `TestRecordDelivery_EmitsMetrics`
+  (counter/failure/duration recorded with correct labels), `…_NilMetricsIsSafe`.
+- `internal/webhook/dispatcher_integration_test.go` (`-tags integration`,
+  testcontainers Postgres) — `TestDispatcher_DeliversConcurrently`: 20 deliveries
+  at concurrency 5 asserts max-in-flight ∈ [2, 5], all rows delivered, and the
+  delivery metric counts all 20; the success test also asserts a `delivered`
+  metric was recorded.
diff --git a/go.mod b/go.mod
index f828092..f47c719 100644
--- a/go.mod
+++ b/go.mod
@@ -126,7 +126,7 @@ require (
 	go.uber.org/multierr v1.11.0 // indirect
 	golang.org/x/net v0.53.0 // indirect
 	golang.org/x/oauth2 v0.33.0 // indirect
-	golang.org/x/sync v0.20.0 // indirect
+	golang.org/x/sync v0.20.0
 	golang.org/x/sys v0.45.0 // indirect
 	golang.org/x/text v0.37.0 // indirect
 	golang.org/x/time v0.14.0
diff --git a/internal/config/env.go b/internal/config/env.go
index 05628af..06e9517 100644
--- a/internal/config/env.go
+++ b/internal/config/env.go
@@ -94,6 +94,17 @@ type WebhookConfig struct {
 	Timeout      time.Duration
 	MaxAttempts  int
 	Retention    time.Duration
+	Concurrency  int
+}
+
+// QuotaConfig holds per-tenant rate-limit and usage-quota settings.
+type QuotaConfig struct {
+	// RateLimitRPS is the sustained per-tenant request rate (requests/second).
+	RateLimitRPS float64
+	// RateLimitBurst is the per-tenant token-bucket burst size.
+	RateLimitBurst int
+	// AssetQuota is the maximum number of assets a tenant may own. 0 = unlimited.
+	AssetQuota int64
 }
 
 type EnvConfig struct {
@@ -105,11 +116,22 @@ type EnvConfig struct {
 	Storage            StorageConfig
 	Outbox             OutboxConfig
 	Webhook            WebhookConfig
+	Quota              QuotaConfig
 	CORSAllowedOrigins []string
 	LogLevel           string
 	EncryptionKey      string
-	AutoMigrate        bool
-	MaxAssetSizeBytes  int64
+	// WebhookEncryptionKey encrypts webhook secrets at rest, separate from the
+	// auth/EncryptionKey so a leak of one does not compromise the other. Falls
+	// back to EncryptionKey when WEBHOOK_ENCRYPTION_KEY is unset.
+	WebhookEncryptionKey string
+	AutoMigrate          bool
+	// MigrationAllowDestructive gates migration versions 7 and 8 which drop
+	// or alter existing user data. Defaults to false; must be set to true
+	// explicitly on first bootstrap of a fresh database.
+	MigrationAllowDestructive bool
+	MaxAssetSizeBytes         int64
+	// IdempotencyTTL is how long a stored idempotency key/response is replayable.
+	IdempotencyTTL time.Duration
 }
 
 // --- Singleton ---
@@ -184,6 +206,17 @@ func GetEnvConfig(envFile string) (EnvConfig, error) {
 		)
 	}
 
+	// Webhook secrets are encrypted with their own key, separate from the auth
+	// key. When unset, fall back to ENCRYPTION_KEY for backward compatibility.
+	webhookEncryptionKey := os.Getenv("WEBHOOK_ENCRYPTION_KEY")
+	if webhookEncryptionKey == "" {
+		webhookEncryptionKey = encryptionKey
+	} else if len(webhookEncryptionKey) != 32 {
+		return EnvConfig{}, NewInitializationError(
+			fmt.Sprintf("WEBHOOK_ENCRYPTION_KEY must be exactly 32 bytes for AES-256, got %d", len(webhookEncryptionKey)), nil,
+		)
+	}
+
 	traceSamplingRate := 0.1
 	if raw := os.Getenv("TRACE_SAMPLING_RATE"); raw != "" {
 		if parsed, err := strconv.ParseFloat(raw, 64); err == nil {
@@ -198,6 +231,32 @@ func GetEnvConfig(envFile string) (EnvConfig, error) {
 		}
 	}
 
+	idempotencyTTL := 24 * time.Hour
+	if raw := os.Getenv("IDEMPOTENCY_TTL"); raw != "" {
+		if d, err := time.ParseDuration(raw); err == nil && d > 0 {
+			idempotencyTTL = d
+		}
+	}
+
+	tenantRateRPS := 10.0
+	if raw := os.Getenv("TENANT_RATE_LIMIT_RPS"); raw != "" {
+		if f, err := strconv.ParseFloat(raw, 64); err == nil && f > 0 {
+			tenantRateRPS = f
+		}
+	}
+	tenantRateBurst := 20
+	if raw := os.Getenv("TENANT_RATE_LIMIT_BURST"); raw != "" {
+		if n, err := strconv.Atoi(raw); err == nil && n > 0 {
+			tenantRateBurst = n
+		}
+	}
+	tenantAssetQuota := int64(0) // 0 = unlimited
+	if raw := os.Getenv("TENANT_ASSET_QUOTA"); raw != "" {
+		if n, err := strconv.ParseInt(raw, 10, 64); err == nil && n >= 0 {
+			tenantAssetQuota = n
+		}
+	}
+
 	corsOrigins := []string{"http://localhost:5173"}
 	if raw := os.Getenv("CORS_ALLOWED_ORIGINS"); raw != "" {
 		corsOrigins = strings.Split(raw, ",")
@@ -258,6 +317,12 @@ func GetEnvConfig(envFile string) (EnvConfig, error) {
 			webhookRetention = d
 		}
 	}
+	webhookConcurrency := 10
+	if raw := os.Getenv("WEBHOOK_CONCURRENCY"); raw != "" {
+		if n, err := strconv.Atoi(raw); err == nil && n > 0 {
+			webhookConcurrency = n
+		}
+	}
 
 	return EnvConfig{
 		Environment: env,
@@ -299,11 +364,14 @@ func GetEnvConfig(envFile string) (EnvConfig, error) {
 				PublicEndpointURL: os.Getenv("S3_PUBLIC_ENDPOINT_URL"),
 			},
 		},
-		CORSAllowedOrigins: corsOrigins,
-		LogLevel:           envOr("LOG_LEVEL", "INFO"),
-		EncryptionKey:      encryptionKey,
-		AutoMigrate:        strings.ToLower(os.Getenv("AUTO_MIGRATE")) == "true",
-		MaxAssetSizeBytes:  maxAssetSize,
+		CORSAllowedOrigins:   corsOrigins,
+		LogLevel:             envOr("LOG_LEVEL", "INFO"),
+		EncryptionKey:        encryptionKey,
+		WebhookEncryptionKey: webhookEncryptionKey,
+		AutoMigrate:          strings.ToLower(os.Getenv("AUTO_MIGRATE")) == "true",
+		MigrationAllowDestructive: strings.ToLower(os.Getenv("MIGRATION_ALLOW_DESTRUCTIVE")) == "true",
+		MaxAssetSizeBytes:         maxAssetSize,
+		IdempotencyTTL:       idempotencyTTL,
 		Outbox: OutboxConfig{
 			RelayInterval: outboxRelayInterval,
 			RelayBatch:    outboxRelayBatch,
@@ -316,6 +384,12 @@ func GetEnvConfig(envFile string) (EnvConfig, error) {
 			Timeout:      webhookTimeout,
 			MaxAttempts:  webhookMaxAttempts,
 			Retention:    webhookRetention,
+			Concurrency:  webhookConcurrency,
+		},
+		Quota: QuotaConfig{
+			RateLimitRPS:   tenantRateRPS,
+			RateLimitBurst: tenantRateBurst,
+			AssetQuota:     tenantAssetQuota,
 		},
 	}, nil
 }
diff --git a/internal/database/migrate.go b/internal/database/migrate.go
index a05413c..c6ea847 100644
--- a/internal/database/migrate.go
+++ b/internal/database/migrate.go
@@ -9,14 +9,34 @@ import (
 	"github.com/golang-migrate/migrate/v4"
 	"github.com/golang-migrate/migrate/v4/database/postgres"
 	"github.com/golang-migrate/migrate/v4/source/iofs"
+	"github.com/lib/pq"
 )
 
 //go:embed migrations/*.sql
 var migrationFiles embed.FS
 
-// RunMigrations applies all pending up-migrations. Idempotent — safe to call on
-// every startup when AUTO_MIGRATE=true.
-func RunMigrations(db *sql.DB) error {
+// destructiveMigrations lists schema migration versions that drop or alter
+// existing user data. They are gated behind an explicit opt-in flag so a
+// fresh database can bootstrap safely while production databases with
+// existing data are protected from accidental data loss.
+var destructiveMigrations = []uint{7, 8}
+
+// RunMigrations applies all pending up-migrations. Idempotent — safe to call
+// on every startup when AUTO_MIGRATE=true. Destructive migrations (versions
+// 7 and 8) require allowDestructive=true; otherwise the function returns an
+// error and refuses to apply anything.
+func RunMigrations(db *sql.DB, allowDestructive bool) error {
+	pending, err := destructiveMigrationsPending(db)
+	if err != nil {
+		return fmt.Errorf("check destructive migrations: %w", err)
+	}
+	if len(pending) > 0 && !allowDestructive {
+		return fmt.Errorf(
+			"destructive migrations %v are pending. Set MIGRATION_ALLOW_DESTRUCTIVE=true to apply them",
+			pending,
+		)
+	}
+
 	src, err := iofs.New(migrationFiles, "migrations")
 	if err != nil {
 		return fmt.Errorf("migration source: %w", err)
@@ -38,3 +58,59 @@ func RunMigrations(db *sql.DB) error {
 
 	return nil
 }
+
+// destructiveMigrationsPending returns the subset of destructiveMigrations
+// that still has to be applied.
+//
+// golang-migrate's postgres driver keeps a single row in schema_migrations
+// holding only the CURRENT version, not one row per applied migration. A
+// destructive version therefore counts as applied when it is <= the highest
+// recorded version; testing for its own row would report 7 and 8 as pending
+// forever once the schema has moved past them. When no version is recorded
+// (fresh database, or schema_migrations does not exist yet) every destructive
+// version is reported pending.
+func destructiveMigrationsPending(db *sql.DB) ([]uint, error) {
+	applied, err := loadAppliedMigrations(db)
+	if err != nil {
+		return nil, err
+	}
+
+	// Highest recorded version; only meaningful when applied is non-empty.
+	var current uint
+	for v := range applied {
+		if v > current {
+			current = v
+		}
+	}
+
+	var pending []uint
+	for _, v := range destructiveMigrations {
+		if len(applied) == 0 || v > current {
+			pending = append(pending, v)
+		}
+	}
+	return pending, nil
+}
+
+// loadAppliedMigrations returns the versions recorded in schema_migrations as
+// a set. Note that golang-migrate records only the current schema version
+// there, so the set normally holds at most one entry.
+//
+// If the table does not exist yet (fresh database before migrate has been
+// initialised) the set is empty, so the caller treats all migrations as
+// pending. Negative versions are skipped: golang-migrate uses -1 (NilVersion)
+// for "no version", and converting that to uint would wrap around to a huge
+// bogus version number.
+func loadAppliedMigrations(db *sql.DB) (map[uint]bool, error) {
+	applied := make(map[uint]bool)
+	rows, err := db.Query("SELECT version FROM schema_migrations")
+	if err != nil {
+		if isUndefinedTableErr(err) {
+			return applied, nil
+		}
+		return nil, fmt.Errorf("query schema_migrations: %w", err)
+	}
+	defer rows.Close()
+	for rows.Next() {
+		var v int64
+		if err := rows.Scan(&v); err != nil {
+			return nil, fmt.Errorf("scan schema_migrations: %w", err)
+		}
+		if v < 0 {
+			// NilVersion (-1): nothing is applied; do not let it wrap.
+			continue
+		}
+		applied[uint(v)] = true
+	}
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("iterate schema_migrations: %w", err)
+	}
+	return applied, nil
+}
+
+// isUndefinedTableErr reports whether err is a PostgreSQL "undefined_table"
+// error (SQLSTATE 42P01), which is what postgres raises when
+// schema_migrations does not yet exist on a fresh database.
+//
+// lib/pq errors are matched directly. Any other error in the chain that
+// exposes SQLState() (for example pgx's *pgconn.PgError) is matched as well,
+// so the check does not depend on which driver backs the *sql.DB.
+func isUndefinedTableErr(err error) bool {
+	const undefinedTable = "42P01"
+
+	var pqErr *pq.Error
+	if errors.As(err, &pqErr) {
+		return pqErr.Code == undefinedTable
+	}
+
+	var stateErr interface{ SQLState() string }
+	if errors.As(err, &stateErr) {
+		return stateErr.SQLState() == undefinedTable
+	}
+	return false
+}
diff --git a/internal/database/migrations/000006_api_keys.down.sql b/internal/database/migrations/000006_api_keys.down.sql
new file mode 100644
index 0000000..3a3da99
--- /dev/null
+++ b/internal/database/migrations/000006_api_keys.down.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS api_keys;
diff --git a/internal/database/migrations/000006_api_keys.up.sql b/internal/database/migrations/000006_api_keys.up.sql
new file mode 100644
index 0000000..6f3ed75
--- /dev/null
+++ b/internal/database/migrations/000006_api_keys.up.sql
@@ -0,0 +1,17 @@
+-- API keys: the identity source of record for tenants.
+-- A key is presented as mp_<prefix>_<secret>; only its SHA-256 hash is stored.
+-- tenant_id maps 1:1 to assets.owner_id / webhook_registrations.user_id.
+CREATE TABLE IF NOT EXISTS api_keys (
+    id         UUID        PRIMARY KEY DEFAULT uuid_generate_v4(),
+    tenant_id  TEXT        NOT NULL,
+    key_hash   TEXT        NOT NULL UNIQUE,
+    prefix     TEXT        NOT NULL,
+    scopes     JSONB       NOT NULL DEFAULT '[]'::jsonb,
+    expires_at TIMESTAMPTZ,
+    revoked_at TIMESTAMPTZ,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- key_hash gets no extra index: its UNIQUE constraint already creates one.
+CREATE INDEX IF NOT EXISTS idx_api_keys_prefix ON api_keys (prefix);
+CREATE INDEX IF NOT EXISTS idx_api_keys_tenant ON api_keys (tenant_id);
diff --git a/internal/database/migrations/000007_split_webhook_key.down.sql b/internal/database/migrations/000007_split_webhook_key.down.sql
new file mode 100644
index 0000000..dbb9956
--- /dev/null
+++ b/internal/database/migrations/000007_split_webhook_key.down.sql
@@ -0,0 +1 @@
+-- No-op: truncated webhook registrations cannot be restored.
diff --git a/internal/database/migrations/000007_split_webhook_key.up.sql b/internal/database/migrations/000007_split_webhook_key.up.sql
new file mode 100644
index 0000000..29e93af
--- /dev/null
+++ b/internal/database/migrations/000007_split_webhook_key.up.sql
@@ -0,0 +1,6 @@
+-- Webhook secrets move from the shared ENCRYPTION_KEY to a dedicated
+-- WEBHOOK_ENCRYPTION_KEY. The project is pre-launch/local, so rather than
+-- re-encrypting existing rows we drop them: any secret still encrypted with the
+-- old key would fail to decrypt under the new key. CASCADE also clears the
+-- dependent webhook_deliveries rows.
+TRUNCATE webhook_registrations CASCADE;
diff --git a/internal/database/migrations/000008_assets_owner_not_null.down.sql b/internal/database/migrations/000008_assets_owner_not_null.down.sql
new file mode 100644
index 0000000..7804497
--- /dev/null
+++ b/internal/database/migrations/000008_assets_owner_not_null.down.sql
@@ -0,0 +1,3 @@
+DROP INDEX IF EXISTS idx_assets_owner;
+
+ALTER TABLE assets ALTER COLUMN owner_id DROP NOT NULL;
diff --git a/internal/database/migrations/000008_assets_owner_not_null.up.sql b/internal/database/migrations/000008_assets_owner_not_null.up.sql
new file mode 100644
index 0000000..2d13eed
--- /dev/null
+++ b/internal/database/migrations/000008_assets_owner_not_null.up.sql
@@ -0,0 +1,10 @@
+-- Tenant isolation: owner_id becomes a required, indexed column.
+-- The project is pre-launch/local, so existing assets (which have a nullable,
+-- possibly-NULL owner_id and are unreachable under the new owner-scoped queries)
+-- are deleted rather than backfilled. Deleting assets cascades to dependent
+-- variants.image / variants.video / jobs (all ON DELETE CASCADE).
+DELETE FROM assets;
+
+ALTER TABLE assets ALTER COLUMN owner_id SET NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_assets_owner ON assets (owner_id);
diff --git a/internal/database/migrations/000009_idempotency_keys.down.sql b/internal/database/migrations/000009_idempotency_keys.down.sql
new file mode 100644
index 0000000..1cf0d86
--- /dev/null
+++ b/internal/database/migrations/000009_idempotency_keys.down.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS idempotency_keys;
diff --git a/internal/database/migrations/000009_idempotency_keys.up.sql b/internal/database/migrations/000009_idempotency_keys.up.sql
new file mode 100644
index 0000000..cb734c3
--- /dev/null
+++ b/internal/database/migrations/000009_idempotency_keys.up.sql
@@ -0,0 +1,20 @@
+-- Idempotency keys: Stripe-style full-response replay, scoped per tenant.
+-- The first request for a (tenant_id, key) inserts a 'pending' row (the unique
+-- PK acts as a lock); once the handler completes, the response is stored and the
+-- row flips to 'done'. Replays within the TTL return the stored response.
+CREATE TABLE IF NOT EXISTS idempotency_keys (
+    tenant_id           TEXT        NOT NULL,
+    key                 TEXT        NOT NULL,
+    request_fingerprint TEXT        NOT NULL,   -- sha256(method+path+body)
+    status              TEXT        NOT NULL DEFAULT 'pending',  -- pending | done
+    response_status     INT,
+    response_body       JSONB,
+    asset_id            UUID,
+    created_at          TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    expires_at          TIMESTAMPTZ NOT NULL,
+
+    PRIMARY KEY (tenant_id, key)
+);
+
+-- Supports the background TTL sweep.
+CREATE INDEX IF NOT EXISTS idx_idempotency_keys_expires ON idempotency_keys (expires_at);
diff --git a/internal/handler/asset_handler.go b/internal/handler/asset_handler.go
index 8c92521..187c90c 100644
--- a/internal/handler/asset_handler.go
+++ b/internal/handler/asset_handler.go
@@ -104,7 +104,7 @@ func (h *AssetHandler) CreateAsset(w http.ResponseWriter, r *http.Request) {
 		if h.m != nil {
 			h.m.AssetProcessingFailed.Add(timeoutCtx, 1)
 		}
-		utils.RespondJSON(w, map[string]string{"status": "error", "message": "Failed to create asset", "error": err.Error()}, http.StatusInternalServerError)
+		utils.RespondJSON(w, map[string]string{"status": "error", "message": "Failed to create asset"}, http.StatusInternalServerError)
 		return
 	}
 	if res == nil {
@@ -151,7 +151,9 @@ func (h *AssetHandler) MarkAssetUploaded(w http.ResponseWriter, r *http.Request)
 		h.logger.Sugar().Errorf("Failed to mark asset uploaded: %v", err)
 		span.RecordError(err)
 		span.SetStatus(codes.Error, "Failed to mark asset uploaded")
-		utils.RespondJSON(w, map[string]string{"status": "error", "message": "Failed to mark asset uploaded", "error": err.Error()}, http.StatusInternalServerError)
+		// Typed app errors (e.g. NotFound for cross-tenant/absent assets) map to
+		// their status; everything else falls back to 500.
+		utils.WriteErrorResponse(w, err)
 		return
 	}
 
diff --git a/internal/handler/asset_handler_test.go b/internal/handler/asset_handler_test.go
new file mode 100644
index 0000000..15f9463
--- /dev/null
+++ b/internal/handler/asset_handler_test.go
@@ -0,0 +1,61 @@
+package handler
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/go-chi/chi/v5"
+	"github.com/google/uuid"
+	"github.com/rndmcodeguy20/mpiper/internal/models"
+	apperrors "github.com/rndmcodeguy20/mpiper/pkg/errors"
+	"go.uber.org/zap"
+)
+
+// fakeAssetService implements service.AssetService for handler tests.
+type fakeAssetService struct {
+	markErr error
+}
+
+func (f *fakeAssetService) CreateAsset(_ context.Context, _ models.UploadAssetRequest) (*models.UploadAssetResponse, error) {
+	return &models.UploadAssetResponse{}, nil
+}
+
+func (f *fakeAssetService) MarkAssetUploaded(_ context.Context, _ uuid.UUID) error {
+	return f.markErr
+}
+
+// serveComplete routes one GET through chi so the {assetID} URL param resolves.
+func serveComplete(h *AssetHandler, assetID string) *httptest.ResponseRecorder {
+	target := "/api/v1/assets/" + assetID + "/complete"
+	recorder := httptest.NewRecorder()
+	router := chi.NewRouter()
+	router.Get("/api/v1/assets/{assetID}/complete", h.MarkAssetUploaded)
+	router.ServeHTTP(recorder, httptest.NewRequest(http.MethodGet, target, nil))
+	return recorder
+}
+
+func TestMarkAssetUploaded_CrossTenantReturns404(t *testing.T) {
+	svc := &fakeAssetService{markErr: apperrors.NewNotFoundError("Asset not found", nil)}
+	resp := serveComplete(NewAssetHandler(svc, zap.NewNop(), nil), uuid.New().String())
+	if got, want := resp.Code, http.StatusNotFound; got != want {
+		t.Errorf("status = %d, want %d", got, want)
+	}
+}
+
+func TestMarkAssetUploaded_Success200(t *testing.T) {
+	svc := &fakeAssetService{} // zero value: MarkAssetUploaded returns a nil error
+	resp := serveComplete(NewAssetHandler(svc, zap.NewNop(), nil), uuid.New().String())
+	if got, want := resp.Code, http.StatusOK; got != want {
+		t.Errorf("status = %d, want %d", got, want)
+	}
+}
+
+func TestMarkAssetUploaded_InvalidUUID400(t *testing.T) {
+	handler := NewAssetHandler(&fakeAssetService{}, zap.NewNop(), nil)
+	resp := serveComplete(handler, "not-a-uuid")
+	if got, want := resp.Code, http.StatusBadRequest; got != want {
+		t.Errorf("status = %d, want %d", got, want)
+	}
+}
diff --git a/internal/handler/webhook_handler.go b/internal/handler/webhook_handler.go
index 6d503af..e9e2619 100644
--- a/internal/handler/webhook_handler.go
+++ b/internal/handler/webhook_handler.go
@@ -35,7 +35,7 @@ func (h *WebhookHandler) Create(w http.ResponseWriter, r *http.Request) {
 	reg, err := h.svc.Create(r.Context(), req.URL, req.Secret, req.Events)
 	if err != nil {
 		h.logger.Warn("webhook create failed", zap.Error(err))
-		utils.RespondJSON(w, map[string]string{"status": "error", "message": err.Error()}, http.StatusBadRequest)
+		utils.WriteErrorResponse(w, err)
 		return
 	}
 
@@ -46,7 +46,7 @@ func (h *WebhookHandler) List(w http.ResponseWriter, r *http.Request) {
 	regs, err := h.svc.List(r.Context())
 	if err != nil {
 		h.logger.Error("webhook list failed", zap.Error(err))
-		utils.RespondJSON(w, map[string]string{"status": "error", "message": err.Error()}, http.StatusInternalServerError)
+		utils.WriteErrorResponse(w, err)
 		return
 	}
 	utils.RespondJSON(w, map[string]interface{}{"status": "success", "data": regs}, http.StatusOK)
@@ -61,11 +61,7 @@ func (h *WebhookHandler) Delete(w http.ResponseWriter, r *http.Request) {
 	}
 
 	if err := h.svc.Delete(r.Context(), id); err != nil {
-		status := http.StatusInternalServerError
-		if err.Error() == "not found" {
-			status = http.StatusNotFound
-		}
-		utils.RespondJSON(w, map[string]string{"status": "error", "message": err.Error()}, status)
+		utils.WriteErrorResponse(w, err)
 		return
 	}
 
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 054436e..cbacef5 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -2,6 +2,7 @@ package metrics
 
 import (
 	"context"
+	"database/sql"
 	"runtime"
 	"time"
 
@@ -36,17 +37,25 @@ type Metrics struct {
 	AssetProcessingDuration metric.Float64Histogram
 	AssetSizeBytes          metric.Int64Histogram
 
+	// TenantThrottleTotal counts per-tenant edge rejections. The only label is
+	// a low-cardinality reason (rate_limit | quota) — tenant id is deliberately
+	// excluded to keep metric cardinality bounded.
+	TenantThrottleTotal metric.Int64Counter
+
 	StorageOperationDuration metric.Float64Histogram
 	StorageOperationTotal    metric.Int64Counter
 	StorageOperationErrors   metric.Int64Counter
 
-	DBQueryDuration     metric.Float64Histogram
-	DBQueryTotal        metric.Int64Counter
-	DBQueryErrors       metric.Int64Counter
-	DBConnectionsActive metric.Int64UpDownCounter
-	DBConnectionsIdle   metric.Int64UpDownCounter
-	DBTransactionTotal  metric.Int64Counter
-	DBTransactionErrors metric.Int64Counter
+	DBQueryDuration        metric.Float64Histogram
+	DBQueryTotal           metric.Int64Counter
+	DBQueryErrors          metric.Int64Counter
+	DBConnectionsActive    metric.Int64ObservableGauge
+	DBConnectionsIdle      metric.Int64ObservableGauge
+	DBConnectionsOpen      metric.Int64ObservableGauge
+	DBConnectionsMaxOpen   metric.Int64ObservableGauge
+	DBConnectionsWaitCount metric.Int64ObservableGauge
+	DBTransactionTotal     metric.Int64Counter
+	DBTransactionErrors    metric.Int64Counter
 
 	QueueMessagePublished metric.Int64Counter
 	QueueMessageConsumed  metric.Int64Counter
@@ -83,6 +92,22 @@ func (m *Metrics) RegisterQueueDepthFunc(fn func(context.Context) (int64, error)
 	return err
 }
 
+// RegisterDBStatsFunc feeds the DB pool gauges from sql.DBStats (fn is usually
+// db.Stats). All gauges are observed from one snapshot so they stay consistent.
+func (m *Metrics) RegisterDBStatsFunc(fn func() sql.DBStats) error {
+	observe := func(_ context.Context, obs metric.Observer) error {
+		stats := fn()
+		obs.ObserveInt64(m.DBConnectionsActive, int64(stats.InUse))
+		obs.ObserveInt64(m.DBConnectionsIdle, int64(stats.Idle))
+		obs.ObserveInt64(m.DBConnectionsOpen, int64(stats.OpenConnections))
+		obs.ObserveInt64(m.DBConnectionsMaxOpen, int64(stats.MaxOpenConnections))
+		obs.ObserveInt64(m.DBConnectionsWaitCount, stats.WaitCount)
+		return nil
+	}
+	_, err := m.meter.RegisterCallback(observe, m.DBConnectionsActive, m.DBConnectionsIdle, m.DBConnectionsOpen, m.DBConnectionsMaxOpen, m.DBConnectionsWaitCount)
+	return err
+}
+
 // RegisterOutboxPendingFunc wires a callback to the OutboxPendingGauge.
 func (m *Metrics) RegisterOutboxPendingFunc(fn func(context.Context) (int64, error)) error {
 	_, err := m.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
@@ -146,7 +171,9 @@ func InitMetrics(ctx context.Context, logger *zap.Logger) (*Metrics, func(contex
 
 	mp := sdkmetric.NewMeterProvider(
 		sdkmetric.WithResource(res),
-		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exp)),
+		// 15s export interval (matches the worker) so RED/SLO dashboards stay
+		// responsive under load; the SDK default of 60s lags the 5m rate window.
+		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(15*time.Second))),
 		sdkmetric.WithView(
 			sdkmetric.NewView(
 				sdkmetric.Instrument{Name: "http.server.request.duration", Kind: sdkmetric.InstrumentKindHistogram},
@@ -157,6 +184,32 @@ func InitMetrics(ctx context.Context, logger *zap.Logger) (*Metrics, func(contex
 				},
 			),
 		),
+		sdkmetric.WithView(
+			// Sub-second-resolution buckets for the queue lag histogram; the
+			// default buckets are too coarse and inflate the queue-wait SLI.
+			sdkmetric.NewView(
+				sdkmetric.Instrument{Name: "queue.processing.lag", Kind: sdkmetric.InstrumentKindHistogram},
+				sdkmetric.Stream{
+					Aggregation: sdkmetric.AggregationExplicitBucketHistogram{
+						Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5},
+					},
+				},
+			),
+		),
+		sdkmetric.WithView(
+			// Fine, milliseconds-resolution buckets for DB query latency. The
+			// default coarse buckets dump nearly all queries into [0,5), so the
+			// p95 reads ~4.75s — a pure artifact (true mean is ~18ms). These
+			// boundaries (1ms..2.5s) make the db.query.duration p95 meaningful.
+			sdkmetric.NewView(
+				sdkmetric.Instrument{Name: "db.query.duration", Kind: sdkmetric.InstrumentKindHistogram},
+				sdkmetric.Stream{
+					Aggregation: sdkmetric.AggregationExplicitBucketHistogram{
+						Boundaries: []float64{0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5},
+					},
+				},
+			),
+		),
 	)
 
 	otel.SetMeterProvider(mp)
@@ -242,6 +295,11 @@ func initBusinessMetrics(m *Metrics, meter metric.Meter, logger *zap.Logger) {
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create asset size histogram: %v", err)
 	}
+	m.TenantThrottleTotal, err = meter.Int64Counter("tenant.throttle.total",
+		metric.WithDescription("Per-tenant edge rejections (rate limit / quota)"), metric.WithUnit("{rejection}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create tenant throttle counter: %v", err)
+	}
 }
 
 func initStorageMetrics(m *Metrics, meter metric.Meter, logger *zap.Logger) {
@@ -280,16 +338,31 @@ func initDatabaseMetrics(m *Metrics, meter metric.Meter, logger *zap.Logger) {
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create DB query errors: %v", err)
 	}
-	m.DBConnectionsActive, err = meter.Int64UpDownCounter("db.connections.active",
-		metric.WithDescription("Number of active database connections"), metric.WithUnit("{connection}"))
+	m.DBConnectionsActive, err = meter.Int64ObservableGauge("db.connections.active",
+		metric.WithDescription("Number of in-use database connections"), metric.WithUnit("{connection}"))
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create DB active connections: %v", err)
 	}
-	m.DBConnectionsIdle, err = meter.Int64UpDownCounter("db.connections.idle",
+	m.DBConnectionsIdle, err = meter.Int64ObservableGauge("db.connections.idle",
 		metric.WithDescription("Number of idle database connections"), metric.WithUnit("{connection}"))
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create DB idle connections: %v", err)
 	}
+	m.DBConnectionsOpen, err = meter.Int64ObservableGauge("db.connections.open",
+		metric.WithDescription("Number of open database connections (in-use + idle)"), metric.WithUnit("{connection}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create DB open connections: %v", err)
+	}
+	m.DBConnectionsMaxOpen, err = meter.Int64ObservableGauge("db.connections.max_open",
+		metric.WithDescription("Configured max open database connections (0 = unlimited)"), metric.WithUnit("{connection}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create DB max open connections: %v", err)
+	}
+	m.DBConnectionsWaitCount, err = meter.Int64ObservableGauge("db.connections.wait_count",
+		metric.WithDescription("Cumulative count of connection waits (pool contention)"), metric.WithUnit("{wait}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create DB wait count: %v", err)
+	}
 	m.DBTransactionTotal, err = meter.Int64Counter("db.transaction.total",
 		metric.WithDescription("Total number of database transactions"), metric.WithUnit("{transaction}"))
 	if err != nil {
diff --git a/internal/metrics/testkit.go b/internal/metrics/testkit.go
new file mode 100644
index 0000000..6f21147
--- /dev/null
+++ b/internal/metrics/testkit.go
@@ -0,0 +1,32 @@
+package metrics
+
+import (
+	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
+	"go.uber.org/zap"
+)
+
+// NewTestMetrics builds a *Metrics backed by a ManualReader instead of the OTLP
+// exporter, so tests can record against the real instruments and then read them
+// back via the returned reader (reader.Collect). It runs the same init* funcs as
+// InitMetrics, so every instrument is non-nil and behaves identically.
+//
+// This lives in a non-_test.go file so it is importable from other packages'
+// tests (e.g. internal/webhook). It pulls in no dependencies beyond sdkmetric
+// and zap, both of which are already production dependencies of this package.
+func NewTestMetrics() (*Metrics, *sdkmetric.ManualReader) {
+	reader := sdkmetric.NewManualReader()
+	mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+	meter := mp.Meter("mpiper-test")
+
+	logger := zap.NewNop()
+	m := &Metrics{meter: meter}
+	initHTTPMetrics(m, meter, logger)
+	initBusinessMetrics(m, meter, logger)
+	initStorageMetrics(m, meter, logger)
+	initDatabaseMetrics(m, meter, logger)
+	initQueueMetrics(m, meter, logger)
+	initOutboxMetrics(m, meter, logger)
+	initWebhookMetrics(m, meter, logger)
+	initSystemMetrics(m, meter, logger)
+	return m, reader
+}
diff --git a/internal/middleware/authorization.go b/internal/middleware/authorization.go
index fc16b09..a89f982 100644
--- a/internal/middleware/authorization.go
+++ b/internal/middleware/authorization.go
@@ -2,62 +2,112 @@ package middleware
 
 import (
 	"context"
+	"errors"
 	"net/http"
 	"strings"
+	"time"
 
-	"github.com/rndmcodeguy20/mpiper/internal/config"
-	"github.com/rndmcodeguy20/mpiper/pkg/errors"
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	apperrors "github.com/rndmcodeguy20/mpiper/pkg/errors"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils"
 	"go.uber.org/zap"
 )
 
 type contextKey string
 
-const userIDKey contextKey = "user_id"
+const (
+	tenantKey contextKey = "tenant_id"
+	scopesKey contextKey = "scopes"
+)
+
+// APIKeyAuthenticator is the subset of the API key repository the auth
+// middleware depends on. Defined here so tests can inject a fake without a DB.
+type APIKeyAuthenticator interface {
+	GetByHash(ctx context.Context, keyHash string) (*repository.APIKey, error)
+}
 
-// AuthMiddleware validates the token, extracts the user ID, and injects it into the context.
-func AuthMiddleware(l *zap.Logger) func(http.Handler) http.Handler {
+// AuthMiddleware authenticates requests via a scoped API key presented as a
+// Bearer credential (scheme matched case-insensitively). The key is hashed and
+// looked up; missing, malformed, unknown, expired, or revoked keys get a 401.
+// On success the key's tenant id and scopes are injected into the context.
+func AuthMiddleware(l *zap.Logger, keys APIKeyAuthenticator) func(http.Handler) http.Handler {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			authHeader := r.Header.Get("Authorization")
 			if authHeader == "" {
 				l.Warn("Authorization header is empty")
-				utils.WriteErrorResponse(w, errors.NewUnauthorizedError("Missing Authorization header", nil))
+				utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("Missing Authorization header", nil))
 				return
 			}
 
 			parts := strings.SplitN(authHeader, " ", 2)
-			if len(parts) != 2 || strings.ToLower(parts[0]) != "bearer" {
+			if len(parts) != 2 || !strings.EqualFold(parts[0], "bearer") {
 				l.Warn("Invalid Authorization header format")
-				utils.WriteErrorResponse(w, errors.NewUnauthorizedError("Invalid Authorization format", nil))
+				utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("Invalid Authorization format", nil))
 				return
 			}
 
-			token := parts[1]
-			if token == "" {
-				l.Warn("Token is empty")
-				utils.WriteErrorResponse(w, errors.NewUnauthorizedError("Empty token", nil))
+			presented := parts[1]
+			if presented == "" {
+				l.Warn("API key is empty")
+				utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("Empty API key", nil))
 				return
 			}
 
-			userID, err := utils.DecryptToken(token, config.MustGet().EncryptionKey)
+			// Validate the wire format before touching the DB. Avoids a lookup
+			// for obviously-bad input and keeps error responses uniform.
+			if _, err := utils.ParseAPIKey(presented); err != nil {
+				l.Warn("Malformed API key")
+				utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("Invalid API key", nil))
+				return
+			}
+
+			key, err := keys.GetByHash(r.Context(), utils.HashAPIKey(presented))
 			if err != nil {
-				l.Warn("Invalid or expired token", zap.Error(err))
-				utils.WriteErrorResponse(w, errors.NewUnauthorizedError("Invalid token", err))
+				if errors.Is(err, repository.ErrAPIKeyNotFound) {
+					l.Warn("Unknown API key")
+					utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("Invalid API key", nil))
+					return
+				}
+				l.Error("API key lookup failed", zap.Error(err))
+				utils.WriteErrorResponse(w, apperrors.NewInternalServerError("Authentication failed", err))
+				return
+			}
+
+			now := time.Now()
+			if utils.IsRevoked(key.RevokedAt) {
+				l.Warn("Revoked API key presented", zap.String("prefix", key.Prefix))
+				utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("API key revoked", nil))
+				return
+			}
+			if utils.IsExpired(key.ExpiresAt, now) {
+				l.Warn("Expired API key presented", zap.String("prefix", key.Prefix))
+				utils.WriteErrorResponse(w, apperrors.NewUnauthorizedError("API key expired", nil))
 				return
 			}
 
-			ctx := context.WithValue(r.Context(), userIDKey, userID)
+			ctx := context.WithValue(r.Context(), tenantKey, key.TenantID)
+			ctx = context.WithValue(ctx, scopesKey, key.Scopes())
 			next.ServeHTTP(w, r.WithContext(ctx))
 		})
 	}
 }
 
-// GetUserID extracts the user ID from context safely.
-func GetUserID(ctx context.Context) (string, bool) {
-	userID, ok := ctx.Value(userIDKey).(string)
-	return userID, ok
+// GetTenant extracts the authenticated tenant id from context safely.
+func GetTenant(ctx context.Context) (string, bool) {
+	tenant, ok := ctx.Value(tenantKey).(string)
+	return tenant, ok
 }
 
-// UserIDKey returns the context key used for storing user_id. Exported for testing.
-func UserIDKey() contextKey { return userIDKey }
+// GetScopes extracts the authenticated key's scopes from context.
+func GetScopes(ctx context.Context) []string {
+	scopes, _ := ctx.Value(scopesKey).([]string)
+	return scopes
+}
+
+// WithTenant returns a context carrying the given tenant id. Exported for
+// tests and internal callers that need to set the tenant without going through
+// the HTTP middleware.
+func WithTenant(ctx context.Context, tenant string) context.Context {
+	return context.WithValue(ctx, tenantKey, tenant)
+}
diff --git a/internal/middleware/authorization_test.go b/internal/middleware/authorization_test.go
index de6ae35..72d8b8c 100644
--- a/internal/middleware/authorization_test.go
+++ b/internal/middleware/authorization_test.go
@@ -1,35 +1,59 @@
 package middleware
 
 import (
+	"context"
 	"net/http"
 	"net/http/httptest"
 	"testing"
+	"time"
 
-	"github.com/rndmcodeguy20/mpiper/internal/config"
+	"github.com/google/uuid"
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils"
 	"go.uber.org/zap"
 )
 
-// 32-byte AES-256 key for the test singleton.
-const testEncryptionKey = "0123456789abcdef0123456789abcdef"
+// fakeAuthenticator is an in-memory APIKeyAuthenticator keyed by hash.
+type fakeAuthenticator struct {
+	byHash map[string]*repository.APIKey
+	err    error
+}
 
-func TestMain(m *testing.M) {
-	config.Init(config.EnvConfig{EncryptionKey: testEncryptionKey})
-	m.Run()
+func (f *fakeAuthenticator) GetByHash(_ context.Context, keyHash string) (*repository.APIKey, error) {
+	if f.err != nil {
+		return nil, f.err
+	}
+	k, ok := f.byHash[keyHash]
+	if !ok {
+		return nil, repository.ErrAPIKeyNotFound
+	}
+	return k, nil
 }
 
-// newGate wraps a handler that records whether it ran with AuthMiddleware.
-func newGate(t *testing.T) (http.Handler, *bool) {
-	t.Helper()
-	called := false
-	next := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		called = true
-		w.WriteHeader(http.StatusOK)
-	})
-	return AuthMiddleware(zap.NewNop())(next), &called
+// mintKey generates a valid API key and registers it in the fake with the
+// given tenant/expiry/revocation, returning the plaintext key.
+func mintKey(f *fakeAuthenticator, tenant string, expiresAt, revokedAt *time.Time) string {
+	mat, err := utils.GenerateAPIKey()
+	if err != nil {
+		panic(err)
+	}
+	if f.byHash == nil {
+		f.byHash = map[string]*repository.APIKey{}
+	}
+	f.byHash[mat.Hash] = &repository.APIKey{
+		ID:        uuid.New(),
+		TenantID:  tenant,
+		KeyHash:   mat.Hash,
+		Prefix:    mat.Prefix,
+		ScopesRaw: []byte(`["assets:write"]`),
+		ExpiresAt: expiresAt,
+		RevokedAt: revokedAt,
+	}
+	return mat.Full
 }
 
 func TestAuthMiddleware_RejectsUnauthenticated(t *testing.T) {
+	f := &fakeAuthenticator{}
 	tests := []struct {
 		name   string
 		header string
@@ -37,59 +61,110 @@ func TestAuthMiddleware_RejectsUnauthenticated(t *testing.T) {
 		{"missing header", ""},
 		{"non-bearer scheme", "Basic abc123"},
 		{"bearer without token", "Bearer "},
-		{"malformed token", "Bearer not-a-valid-token"},
+		{"malformed key", "Bearer not-a-valid-key"},
+		{"unknown key", "Bearer " + func() string { m, _ := utils.GenerateAPIKey(); return m.Full }()},
 	}
 
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			gate, called := newGate(t)
+			called := false
+			next := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				called = true
+				w.WriteHeader(http.StatusOK)
+			})
+			gate := AuthMiddleware(zap.NewNop(), f)(next)
 
 			req := httptest.NewRequest(http.MethodGet, "/api/v1/assets/x/complete", nil)
 			if tc.header != "" {
 				req.Header.Set("Authorization", tc.header)
 			}
 			rec := httptest.NewRecorder()
-
 			gate.ServeHTTP(rec, req)
 
 			if rec.Code != http.StatusUnauthorized {
 				t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized)
 			}
-			if *called {
+			if called {
 				t.Error("next handler ran for unauthenticated request — gate leaked")
 			}
 		})
 	}
 }
 
-func TestAuthMiddleware_AllowsValidTokenAndPopulatesUserID(t *testing.T) {
-	const wantUserID = "user-42"
-	token, err := utils.GenerateToken(wantUserID, testEncryptionKey)
-	if err != nil {
-		t.Fatalf("GenerateToken: %v", err)
-	}
+func TestAuthMiddleware_AllowsValidKeyAndPopulatesTenant(t *testing.T) {
+	const wantTenant = "tenant-42"
+	f := &fakeAuthenticator{}
+	key := mintKey(f, wantTenant, nil, nil)
 
-	var gotUserID string
+	var gotTenant string
 	var gotOK bool
+	var gotScopes []string
 	next := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		gotUserID, gotOK = GetUserID(r.Context())
+		gotTenant, gotOK = GetTenant(r.Context())
+		gotScopes = GetScopes(r.Context())
 		w.WriteHeader(http.StatusOK)
 	})
-	gate := AuthMiddleware(zap.NewNop())(next)
+	gate := AuthMiddleware(zap.NewNop(), f)(next)
 
 	req := httptest.NewRequest(http.MethodGet, "/api/v1/assets/x/complete", nil)
-	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Authorization", "Bearer "+key)
 	rec := httptest.NewRecorder()
-
 	gate.ServeHTTP(rec, req)
 
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK)
 	}
 	if !gotOK {
-		t.Fatal("GetUserID returned ok=false — userID not injected into context")
+		t.Fatal("GetTenant returned ok=false — tenant not injected into context")
+	}
+	if gotTenant != wantTenant {
+		t.Errorf("tenant = %q, want %q", gotTenant, wantTenant)
+	}
+	if len(gotScopes) != 1 || gotScopes[0] != "assets:write" {
+		t.Errorf("scopes = %v, want [assets:write]", gotScopes)
+	}
+}
+
+func TestAuthMiddleware_RejectsExpiredKey(t *testing.T) {
+	store := &fakeAuthenticator{}
+	expiredAt := time.Now().Add(-time.Hour)
+	apiKey := mintKey(store, "tenant-1", &expiredAt, nil)
+
+	reached := false
+	downstream := func(http.ResponseWriter, *http.Request) { reached = true }
+	gate := AuthMiddleware(zap.NewNop(), store)(http.HandlerFunc(downstream))
+
+	req := httptest.NewRequest(http.MethodGet, "/api/v1/assets/x/complete", nil)
+	req.Header.Set("Authorization", "Bearer "+apiKey)
+	resp := httptest.NewRecorder()
+	gate.ServeHTTP(resp, req)
+
+	if got, want := resp.Code, http.StatusUnauthorized; got != want {
+		t.Errorf("status = %d, want %d", got, want)
+	}
+	if reached {
+		t.Error("next handler ran for expired key")
+	}
+}
+
+func TestAuthMiddleware_RejectsRevokedKey(t *testing.T) {
+	f := &fakeAuthenticator{}
+	revoked := time.Now().Add(-time.Minute)
+	key := mintKey(f, "tenant-1", nil, &revoked)
+
+	called := false
+	next := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { called = true })
+	gate := AuthMiddleware(zap.NewNop(), f)(next)
+
+	req := httptest.NewRequest(http.MethodGet, "/api/v1/assets/x/complete", nil)
+	req.Header.Set("Authorization", "Bearer "+key)
+	rec := httptest.NewRecorder()
+	gate.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusUnauthorized {
+		t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized)
 	}
-	if gotUserID != wantUserID {
-		t.Errorf("userID = %q, want %q", gotUserID, wantUserID)
+	if called {
+		t.Error("next handler ran for revoked key")
 	}
 }
diff --git a/internal/middleware/idempotency.go b/internal/middleware/idempotency.go
new file mode 100644
index 0000000..ad1d0c1
--- /dev/null
+++ b/internal/middleware/idempotency.go
@@ -0,0 +1,144 @@
+package middleware
+
+import (
+	"bytes"
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"io"
+	"net/http"
+	"time"
+
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	apperrors "github.com/rndmcodeguy20/mpiper/pkg/errors"
+	"github.com/rndmcodeguy20/mpiper/pkg/utils"
+	"go.uber.org/zap"
+)
+
+// IdempotencyKeyHeader is the request header clients send to make a mutating
+// request safely retryable.
+const IdempotencyKeyHeader = "Idempotency-Key"
+
+// IdempotencyStore is the part of the idempotency repository this middleware
+// calls (Acquire, Complete, Release). Declared here so tests can inject a fake.
+type IdempotencyStore interface {
+	Acquire(ctx context.Context, tenant, key, fingerprint string, ttl time.Duration) (repository.AcquireOutcome, *repository.IdempotencyRecord, error)
+	Complete(ctx context.Context, tenant, key string, status int, body []byte) error
+	Release(ctx context.Context, tenant, key string) error
+}
+
+// captureWriter tees the response: the status code and every byte still reach
+// the wrapped ResponseWriter, while a copy is kept so it can be persisted.
+type captureWriter struct {
+	http.ResponseWriter
+	status int
+	buf    bytes.Buffer
+}
+
+func (cw *captureWriter) WriteHeader(code int) {
+	cw.status = code
+	cw.ResponseWriter.WriteHeader(code)
+}
+
+func (cw *captureWriter) Write(p []byte) (int, error) {
+	_, _ = cw.buf.Write(p) // bytes.Buffer.Write always returns a nil error
+	return cw.ResponseWriter.Write(p)
+}
+
+// IdempotencyMiddleware provides Stripe-style idempotency for mutating requests.
+// When an Idempotency-Key header is present (and a tenant is authenticated):
+//   - the first request executes the handler and its response is stored;
+//   - a replay with the same key + same request returns the stored response;
+//   - the same key with a different request body returns 422;
+//   - a concurrent duplicate still in flight returns 409.
+//
+// Responses with status >= 500 and handler panics release the key rather than
+// caching it. Requests without the header or tenant pass through unchanged.
+// The middleware must run AFTER AuthMiddleware so the tenant is in context.
+func IdempotencyMiddleware(l *zap.Logger, store IdempotencyStore, ttl time.Duration) func(http.Handler) http.Handler {
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			key := r.Header.Get(IdempotencyKeyHeader)
+			tenant, ok := GetTenant(r.Context())
+			if key == "" || !ok || tenant == "" {
+				// No key, or no tenant to scope by (auth should have set it).
+				next.ServeHTTP(w, r)
+				return
+			}
+
+			// Buffer the body so it can be fingerprinted AND re-read by the handler.
+			var body []byte
+			if r.Body != nil {
+				b, err := io.ReadAll(r.Body)
+				if err != nil {
+					utils.WriteErrorResponse(w, apperrors.NewBadRequestError("Failed to read request body", err))
+					return
+				}
+				body = b
+				_ = r.Body.Close()
+				r.Body = io.NopCloser(bytes.NewReader(body))
+			}
+			fingerprint := fingerprintRequest(r.Method, r.URL.Path, body)
+
+			outcome, rec, err := store.Acquire(r.Context(), tenant, key, fingerprint, ttl)
+			if err != nil {
+				l.Error("idempotency acquire failed", zap.Error(err))
+				utils.WriteErrorResponse(w, apperrors.NewInternalServerError("Idempotency check failed", err))
+				return
+			}
+
+			switch outcome {
+			case repository.AcquireReplay:
+				w.Header().Set("Content-Type", "application/json")
+				w.Header().Set("Idempotent-Replayed", "true")
+				status := rec.ResponseStatus
+				if status == 0 {
+					status = http.StatusOK
+				}
+				w.WriteHeader(status)
+				_, _ = w.Write(rec.ResponseBody)
+				return
+			case repository.AcquireMismatch:
+				utils.WriteErrorResponse(w, apperrors.NewUnprocessableEntityError("Idempotency-Key reused for a different request", nil))
+				return
+			case repository.AcquireInFlight:
+				utils.WriteErrorResponse(w, apperrors.NewConflictError("A request with this Idempotency-Key is already in progress", nil))
+				return
+			}
+
+			// AcquireAcquired — run the handler, capturing the response. Store calls
+			// are detached from client cancellation so a disconnect cannot strand the key.
+			cw := &captureWriter{ResponseWriter: w, status: http.StatusOK}
+			storeCtx := context.WithoutCancel(r.Context())
+			cached := false
+			defer func() {
+				if cached {
+					return
+				}
+				// 5xx or handler panic: release the key so the client may retry.
+				if err := store.Release(storeCtx, tenant, key); err != nil {
+					l.Warn("idempotency release failed", zap.Error(err))
+				}
+			}()
+			next.ServeHTTP(cw, r)
+			if cw.status >= 500 {
+				return
+			}
+			cached = true
+			if err := store.Complete(storeCtx, tenant, key, cw.status, cw.buf.Bytes()); err != nil {
+				// The client already received a valid response; just log.
+				l.Warn("idempotency complete failed", zap.Error(err))
+			}
+		})
+	}
+}
+
+// fingerprintRequest returns the hex-encoded SHA-256 of method, path and body.
+// A NUL byte separates the fields so adjacent values do not run together,
+// e.g. ("PO", "ST") and ("POST", "") produce different digests.
+func fingerprintRequest(method, path string, body []byte) string {
+	preimage := append(append([]byte(method), 0), path...)
+	preimage = append(append(preimage, 0), body...)
+	digest := sha256.Sum256(preimage)
+	return hex.EncodeToString(digest[:])
+}
diff --git a/internal/middleware/idempotency_test.go b/internal/middleware/idempotency_test.go
new file mode 100644
index 0000000..750b46c
--- /dev/null
+++ b/internal/middleware/idempotency_test.go
@@ -0,0 +1,146 @@
+package middleware
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"go.uber.org/zap"
+)
+
+// fakeIdemStore is a scripted IdempotencyStore: Acquire hands back the preset
+// outcome and record, and each method call is tallied for the assertions.
+type fakeIdemStore struct {
+	outcome   repository.AcquireOutcome
+	rec       *repository.IdempotencyRecord
+	gotStatus int
+	gotBody   []byte
+
+	acquired, complete, released int
+}
+
+func (s *fakeIdemStore) Acquire(context.Context, string, string, string, time.Duration) (repository.AcquireOutcome, *repository.IdempotencyRecord, error) {
+	s.acquired++
+	return s.outcome, s.rec, nil
+}
+func (s *fakeIdemStore) Complete(_ context.Context, _, _ string, code int, payload []byte) error {
+	s.complete++
+	s.gotStatus, s.gotBody = code, payload
+	return nil
+}
+func (s *fakeIdemStore) Release(context.Context, string, string) error {
+	s.released++
+	return nil
+}
+
+// runIdem sends one POST through the middleware; empty key/tenant are omitted.
+func runIdem(store *fakeIdemStore, key string, tenant string, handler http.HandlerFunc) *httptest.ResponseRecorder {
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/storage/presign", strings.NewReader(`{"x":1}`))
+	if key != "" {
+		req.Header.Set(IdempotencyKeyHeader, key)
+	}
+	if tenant != "" {
+		req = req.WithContext(WithTenant(req.Context(), tenant))
+	}
+	out := httptest.NewRecorder()
+	IdempotencyMiddleware(zap.NewNop(), store, time.Hour)(handler).ServeHTTP(out, req)
+	return out
+}
+
+func TestIdempotency_NoHeader_PassesThrough(t *testing.T) {
+	handlerRan := false
+	fake := &fakeIdemStore{outcome: repository.AcquireAcquired}
+	resp := runIdem(fake, "", "tenant-1", func(w http.ResponseWriter, _ *http.Request) {
+		handlerRan = true
+		w.WriteHeader(http.StatusOK)
+	})
+	if !handlerRan {
+		t.Error("handler should run when no Idempotency-Key present")
+	}
+	if calls := fake.acquired; calls != 0 {
+		t.Error("store should not be consulted without a key")
+	}
+	if got := resp.Code; got != http.StatusOK {
+		t.Errorf("status = %d, want 200", got)
+	}
+}
+
+func TestIdempotency_Acquired_RunsHandlerAndStores(t *testing.T) {
+	fake := &fakeIdemStore{outcome: repository.AcquireAcquired}
+	resp := runIdem(fake, "key-1", "tenant-1", func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte(`{"assetId":"abc"}`))
+	})
+	if calls := fake.complete; calls != 1 {
+		t.Errorf("Complete calls = %d, want 1", calls)
+	}
+	if code := fake.gotStatus; code != http.StatusOK {
+		t.Errorf("stored status = %d, want 200", code)
+	}
+	if stored := string(fake.gotBody); stored != `{"assetId":"abc"}` {
+		t.Errorf("stored body = %q", stored)
+	}
+	if sent := resp.Body.String(); sent != `{"assetId":"abc"}` {
+		t.Errorf("response body = %q", sent)
+	}
+}
+
+// On AcquireReplay the stored status and body are written back, the handler
+// is skipped, and the response is flagged with Idempotent-Replayed.
+func TestIdempotency_Replay_ReturnsStoredResponse(t *testing.T) {
+	stored := &repository.IdempotencyRecord{ResponseStatus: http.StatusOK, ResponseBody: []byte(`{"assetId":"abc"}`)}
+	fake := &fakeIdemStore{outcome: repository.AcquireReplay, rec: stored}
+	handlerRan := false
+	resp := runIdem(fake, "key-1", "tenant-1", func(http.ResponseWriter, *http.Request) {
+		handlerRan = true
+	})
+	if handlerRan {
+		t.Error("handler must NOT run on replay")
+	}
+	if got := resp.Code; got != http.StatusOK {
+		t.Errorf("status = %d, want 200", got)
+	}
+	if sent := resp.Body.String(); sent != `{"assetId":"abc"}` {
+		t.Errorf("replayed body = %q", sent)
+	}
+	if flag := resp.Header().Get("Idempotent-Replayed"); flag != "true" {
+		t.Error("replay should set Idempotent-Replayed header")
+	}
+}
+
+func TestIdempotency_Mismatch_Returns422(t *testing.T) {
+	handlerRan := false
+	fake := &fakeIdemStore{outcome: repository.AcquireMismatch}
+	resp := runIdem(fake, "key-1", "tenant-1", func(http.ResponseWriter, *http.Request) { handlerRan = true })
+	if handlerRan {
+		t.Error("handler must NOT run on fingerprint mismatch")
+	}
+	if got := resp.Code; got != http.StatusUnprocessableEntity {
+		t.Errorf("status = %d, want 422", got)
+	}
+}
+
+func TestIdempotency_InFlight_Returns409(t *testing.T) {
+	fake := &fakeIdemStore{outcome: repository.AcquireInFlight}
+	resp := runIdem(fake, "key-1", "tenant-1", func(http.ResponseWriter, *http.Request) {})
+	if got := resp.Code; got != http.StatusConflict {
+		t.Errorf("status = %d, want 409", got)
+	}
+}
+
+// A 500 from the handler must release the key and never store the response.
+func TestIdempotency_ServerError_ReleasesKey(t *testing.T) {
+	fake := &fakeIdemStore{outcome: repository.AcquireAcquired}
+	fail := func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusInternalServerError) }
+	runIdem(fake, "key-1", "tenant-1", fail)
+	if n := fake.released; n != 1 {
+		t.Errorf("Release calls = %d, want 1 (5xx must not be cached)", n)
+	}
+	if n := fake.complete; n != 0 {
+		t.Errorf("Complete calls = %d, want 0 for 5xx", n)
+	}
+}
diff --git a/internal/middleware/logging.go b/internal/middleware/logging.go
index c046ab2..7c453be 100644
--- a/internal/middleware/logging.go
+++ b/internal/middleware/logging.go
@@ -37,6 +37,11 @@ func LoggerMiddleware(l *zap.Logger) func(next http.Handler) http.Handler {
 				zap.String("proto", r.Proto),
 			)
 
+			// Stamp trace_id/span_id from the active span (TracingMiddleware runs
+			// before this) so request logs cross-link to their Tempo trace and
+			// any handler/service using the context logger inherits the IDs.
+			reqLogger = applogger.WithTrace(r.Context(), reqLogger)
+
 			ctx := applogger.WithLogger(r.Context(), reqLogger)
 			r = r.WithContext(ctx)
 
diff --git a/internal/middleware/metrics.go b/internal/middleware/metrics.go
index 5e02ae4..542c9be 100644
--- a/internal/middleware/metrics.go
+++ b/internal/middleware/metrics.go
@@ -32,43 +32,51 @@ func MetricsMiddleware(m *metrics.Metrics) func(http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			start := time.Now()
 
-			route := chi.RouteContext(r.Context()).RoutePattern()
-			if route == "" {
-				route = "unknown"
-			}
-
 			wrapped := &metricsResponseWriter{ResponseWriter: w, statusCode: http.StatusOK}
 
-			attrs := []attribute.KeyValue{
-				attribute.String("http.method", r.Method),
-				attribute.String("http.route", route),
-			}
+			// In-flight gauge: keyed by method only. The route pattern is not yet
+			// known here (chi populates it during routing, after this middleware),
+			// so it would be "unknown"; using it on +1/-1 still nets to zero but
+			// adds no value. Per-route labels are applied post-routing below.
+			inflightAttrs := []attribute.KeyValue{attribute.String("http.method", r.Method)}
 
 			if m != nil {
-				m.HTTPActiveRequests.Add(r.Context(), 1, metric.WithAttributes(attrs...))
-				defer m.HTTPActiveRequests.Add(r.Context(), -1, metric.WithAttributes(attrs...))
+				m.HTTPActiveRequests.Add(r.Context(), 1, metric.WithAttributes(inflightAttrs...))
+				defer m.HTTPActiveRequests.Add(r.Context(), -1, metric.WithAttributes(inflightAttrs...))
 			}
 
 			defer func() {
 				if rec := recover(); rec != nil {
 					wrapped.statusCode = http.StatusInternalServerError
-					recordHTTPMetrics(m, r, wrapped, start, attrs)
+					recordHTTPMetrics(m, r, wrapped, start)
 					panic(rec)
 				}
 			}()
 
 			next.ServeHTTP(wrapped, r)
-			recordHTTPMetrics(m, r, wrapped, start, attrs)
+			recordHTTPMetrics(m, r, wrapped, start)
 		})
 	}
 }
 
-func recordHTTPMetrics(m *metrics.Metrics, r *http.Request, w *metricsResponseWriter, start time.Time, baseAttrs []attribute.KeyValue) {
+func recordHTTPMetrics(m *metrics.Metrics, r *http.Request, w *metricsResponseWriter, start time.Time) {
 	if m == nil {
 		return
 	}
+	// chi populates the matched route pattern during routing, so it is only
+	// available now (after ServeHTTP). Reading it earlier yields "" — the source
+	// of the previous "unknown" http_route label that broke per-route SLOs.
+	route := chi.RouteContext(r.Context()).RoutePattern()
+	if route == "" {
+		route = "unknown"
+	}
+
 	duration := time.Since(start).Seconds()
-	attrs := append(baseAttrs, attribute.Int("http.status_code", w.statusCode))
+	attrs := []attribute.KeyValue{
+		attribute.String("http.method", r.Method),
+		attribute.String("http.route", route),
+		attribute.Int("http.status_code", w.statusCode),
+	}
 
 	m.HTTPRequestDuration.Record(r.Context(), duration, metric.WithAttributes(attrs...))
 	m.HTTPRequestCount.Add(r.Context(), 1, metric.WithAttributes(attrs...))
diff --git a/internal/middleware/tenant_quota.go b/internal/middleware/tenant_quota.go
new file mode 100644
index 0000000..56423c7
--- /dev/null
+++ b/internal/middleware/tenant_quota.go
@@ -0,0 +1,126 @@
+package middleware
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/rndmcodeguy20/mpiper/internal/metrics"
+	apperrors "github.com/rndmcodeguy20/mpiper/pkg/errors"
+	"github.com/rndmcodeguy20/mpiper/pkg/utils"
+	"go.opentelemetry.io/otel/attribute"
+	otelmetric "go.opentelemetry.io/otel/metric"
+	"go.uber.org/zap"
+	"golang.org/x/time/rate"
+)
+
+// AssetCounter reports a tenant's current asset count for quota enforcement.
+type AssetCounter interface {
+	CountByOwner(ctx context.Context, tenantID string) (int64, error)
+}
+
+// recordThrottle bumps the tenant throttle counter, tagged with a
+// low-cardinality reason; it does nothing when m or the counter is nil.
+func recordThrottle(ctx context.Context, m *metrics.Metrics, reason string) {
+	if m != nil && m.TenantThrottleTotal != nil {
+		m.TenantThrottleTotal.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("reason", reason)))
+	}
+}
+
+// TenantRateLimitMiddleware applies a per-tenant token-bucket rate limit.
+// Each tenant gets `rps` sustained requests/second with a burst of `burst`.
+// Over-limit requests get 429 + Retry-After. Limiters idle for over 10 minutes
+// are evicted lazily on the request path, so no background goroutine is leaked.
+func TenantRateLimitMiddleware(l *zap.Logger, m *metrics.Metrics, rps float64, burst int) func(http.Handler) http.Handler {
+	type entry struct {
+		lim      *rate.Limiter
+		lastSeen time.Time
+	}
+	var (
+		mu        sync.Mutex
+		tenants   = make(map[string]*entry)
+		lastSweep = time.Now()
+	)
+
+	getLimiter := func(tenant string) *rate.Limiter {
+		now := time.Now()
+		mu.Lock()
+		defer mu.Unlock()
+		if now.Sub(lastSweep) >= time.Minute {
+			// At most once a minute, drop tenants idle > 10 minutes to bound memory.
+			for t, e := range tenants {
+				if now.Sub(e.lastSeen) > 10*time.Minute {
+					delete(tenants, t)
+				}
+			}
+			lastSweep = now
+		}
+		e, ok := tenants[tenant]
+		if !ok {
+			e = &entry{lim: rate.NewLimiter(rate.Limit(rps), burst)}
+			tenants[tenant] = e
+		}
+		e.lastSeen = now
+		return e.lim
+	}
+
+	// Seconds to earn one token, rounded UP (min 1) so sub-1 rps retries aren't early.
+	retryAfter := 1
+	if wait := 1 / rps; rps > 0 && wait > 1 {
+		if retryAfter = int(wait); float64(retryAfter) < wait {
+			retryAfter++
+		}
+	}
+
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			tenant, ok := GetTenant(r.Context())
+			if !ok || tenant == "" {
+				// No tenant to key on (auth should have set it) — don't block.
+				next.ServeHTTP(w, r)
+				return
+			}
+			if !getLimiter(tenant).Allow() {
+				recordThrottle(r.Context(), m, "rate_limit")
+				l.Warn("tenant rate limit exceeded", zap.String("tenant", tenant))
+				w.Header().Set("Retry-After", fmt.Sprintf("%d", retryAfter))
+				utils.WriteErrorResponse(w, apperrors.NewTooManyRequestsError("Rate limit exceeded", nil))
+				return
+			}
+			next.ServeHTTP(w, r)
+		})
+	}
+}
+
+// TenantQuotaMiddleware enforces a per-tenant asset-count quota. A quota <= 0
+// means unlimited: next is returned unwrapped and the counter is never called.
+// A tenant whose count has reached the quota gets 403; a count error gets 500.
+func TenantQuotaMiddleware(l *zap.Logger, m *metrics.Metrics, counter AssetCounter, quota int64) func(http.Handler) http.Handler {
+	return func(next http.Handler) http.Handler {
+		if quota <= 0 {
+			return next // unlimited — skip the DB count entirely
+		}
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			tenant, found := GetTenant(r.Context())
+			if !found || tenant == "" {
+				next.ServeHTTP(w, r)
+				return
+			}
+			owned, err := counter.CountByOwner(r.Context(), tenant)
+			switch {
+			case err != nil:
+				l.Error("quota count failed", zap.String("tenant", tenant), zap.Error(err))
+				utils.WriteErrorResponse(w, apperrors.NewInternalServerError("Quota check failed", err))
+			case owned >= quota:
+				recordThrottle(r.Context(), m, "quota")
+				l.Warn("tenant asset quota exceeded", zap.String("tenant", tenant), zap.Int64("count", owned), zap.Int64("quota", quota))
+				denied := fmt.Sprintf("Asset quota exceeded (%d/%d)", owned, quota)
+				utils.WriteErrorResponse(w, apperrors.NewForbiddenError(denied, nil))
+			default:
+				next.ServeHTTP(w, r)
+			}
+		})
+	}
+}
diff --git a/internal/middleware/tenant_quota_test.go b/internal/middleware/tenant_quota_test.go
new file mode 100644
index 0000000..7158342
--- /dev/null
+++ b/internal/middleware/tenant_quota_test.go
@@ -0,0 +1,101 @@
+package middleware
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"go.uber.org/zap"
+)
+
+func serveWithTenant(mw func(http.Handler) http.Handler, tenant string, handler http.HandlerFunc) *httptest.ResponseRecorder {
+	// One bodiless POST; an empty tenant leaves the request context untouched.
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/storage/presign", nil)
+	if tenant != "" {
+		req = req.WithContext(WithTenant(req.Context(), tenant))
+	}
+	out := httptest.NewRecorder()
+	mw(handler).ServeHTTP(out, req)
+	return out
+}
+
+func okHandler() http.HandlerFunc {
+	return func(rw http.ResponseWriter, _ *http.Request) { rw.WriteHeader(http.StatusOK) }
+}
+
+func TestTenantRateLimit_ThrottlesPerTenant(t *testing.T) {
+	// rps=1, burst=1: the first request passes, an immediate second is throttled.
+	limit := TenantRateLimitMiddleware(zap.NewNop(), nil, 1, 1)
+
+	first := serveWithTenant(limit, "tenant-a", okHandler())
+	if first.Code != http.StatusOK {
+		t.Fatalf("first request for tenant-a = %d, want 200", first.Code)
+	}
+	second := serveWithTenant(limit, "tenant-a", okHandler())
+	if second.Code != http.StatusTooManyRequests {
+		t.Errorf("second request for tenant-a = %d, want 429", second.Code)
+	}
+	if second.Header().Get("Retry-After") == "" {
+		t.Error("429 response should carry a Retry-After header")
+	}
+	// A different tenant has its own bucket and is unaffected.
+	if other := serveWithTenant(limit, "tenant-b", okHandler()); other.Code != http.StatusOK {
+		t.Errorf("first request for tenant-b = %d, want 200 (per-tenant isolation)", other.Code)
+	}
+}
+
+func TestTenantRateLimit_NoTenantPassesThrough(t *testing.T) {
+	limit := TenantRateLimitMiddleware(zap.NewNop(), nil, 1, 1)
+	// Three back-to-back requests exceed the burst of 1, yet none is throttled.
+	for attempt := 0; attempt < 3; attempt++ {
+		if resp := serveWithTenant(limit, "", okHandler()); resp.Code != http.StatusOK {
+			t.Fatalf("request %d without tenant = %d, want 200", attempt, resp.Code)
+		}
+	}
+}
+
+// fakeCounter is an AssetCounter stub that returns a fixed count and error.
+type fakeCounter struct {
+	count int64
+	err   error
+}
+
+func (c *fakeCounter) CountByOwner(context.Context, string) (int64, error) {
+	return c.count, c.err
+}
+
+func TestTenantQuota_BlocksOverQuota(t *testing.T) {
+	atLimit := &fakeCounter{count: 5}
+	resp := serveWithTenant(TenantQuotaMiddleware(zap.NewNop(), nil, atLimit, 5), "tenant-a", okHandler())
+	if resp.Code != http.StatusForbidden {
+		t.Errorf("at-quota request = %d, want 403", resp.Code)
+	}
+}
+
+func TestTenantQuota_AllowsUnderQuota(t *testing.T) {
+	belowLimit := &fakeCounter{count: 2}
+	resp := serveWithTenant(TenantQuotaMiddleware(zap.NewNop(), nil, belowLimit, 5), "tenant-a", okHandler())
+	if resp.Code != http.StatusOK {
+		t.Errorf("under-quota request = %d, want 200", resp.Code)
+	}
+}
+
+func TestTenantQuota_ZeroMeansUnlimited(t *testing.T) {
+	// With quota=0 the counter must stay untouched; its error would surface as a 500.
+	tripwire := &fakeCounter{err: errors.New("should not be called")}
+	unlimited := TenantQuotaMiddleware(zap.NewNop(), nil, tripwire, 0)
+	resp := serveWithTenant(unlimited, "tenant-a", okHandler())
+	if resp.Code != http.StatusOK {
+		t.Errorf("unlimited quota request = %d, want 200", resp.Code)
+	}
+}
+
+func TestTenantQuota_CounterErrorReturns500(t *testing.T) {
+	failing := &fakeCounter{err: errors.New("db down")}
+	resp := serveWithTenant(TenantQuotaMiddleware(zap.NewNop(), nil, failing, 5), "tenant-a", okHandler())
+	if resp.Code != http.StatusInternalServerError {
+		t.Errorf("counter error = %d, want 500", resp.Code)
+	}
+}
diff --git a/internal/models/outbox.go b/internal/models/outbox.go
index 297883c..58647d8 100644
--- a/internal/models/outbox.go
+++ b/internal/models/outbox.go
@@ -13,6 +13,12 @@ type OutboxEvent struct {
 	JobID       *int64          `db:"job_id"`
 	Event       string          `db:"event"`
 	Payload     json.RawMessage `db:"payload"`
+	// Traceparent carries the W3C trace context captured when the row was
+	// written, so the distributed trace survives the outbox store-and-forward
+	// hop. The relay re-activates it before publishing to Redis. Nullable:
+	// rows written before this column existed (or without an active span) have
+	// no trace context.
+	Traceparent *string         `db:"traceparent"`
 	Status      string          `db:"status"`
 	Attempts    int             `db:"attempts"`
 	MaxAttempts int             `db:"max_attempts"`
diff --git a/internal/outbox/relay.go b/internal/outbox/relay.go
index ae485b5..6e2f8a8 100644
--- a/internal/outbox/relay.go
+++ b/internal/outbox/relay.go
@@ -8,6 +8,11 @@ import (
 	"github.com/rndmcodeguy20/mpiper/internal/metrics"
 	"github.com/rndmcodeguy20/mpiper/internal/queue"
 	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/trace"
 	"go.uber.org/zap"
 )
 
@@ -17,12 +22,13 @@ type Relay struct {
 	queue    queue.Queue
 	logger   *zap.Logger
 	m        *metrics.Metrics
+	tracer   trace.Tracer
 	interval time.Duration
 	batch    int
 }
 
 func NewRelay(repo repository.OutboxRepository, q queue.Queue, logger *zap.Logger, m *metrics.Metrics, interval time.Duration, batch int) *Relay {
-	return &Relay{repo: repo, queue: q, logger: logger, m: m, interval: interval, batch: batch}
+	return &Relay{repo: repo, queue: q, logger: logger, m: m, tracer: otel.Tracer("mpiper-api"), interval: interval, batch: batch}
 }
 
 // Start runs the relay loop until ctx is cancelled. It finishes the in-flight batch before returning.
@@ -71,8 +77,26 @@ func (r *Relay) tick(ctx context.Context) {
 			continue
 		}
 
-		if _, err := r.queue.Enqueue(ctx, payload); err != nil {
+		// Re-activate the producer's trace context (captured when the row was
+		// written) so the publish + enqueue spans rejoin the original request
+		// trace instead of starting a disconnected root. tick() runs on a
+		// background ticker context, so without this the trace would break here.
+		publishCtx := ctx
+		if row.Traceparent != nil && *row.Traceparent != "" {
+			carrier := propagation.MapCarrier{"traceparent": *row.Traceparent}
+			publishCtx = otel.GetTextMapPropagator().Extract(ctx, carrier)
+		}
+		publishCtx, span := r.tracer.Start(publishCtx, "outbox.publish")
+		span.SetAttributes(
+			attribute.Int64("outbox.row_id", row.ID),
+			attribute.String("event", row.Event),
+		)
+
+		if _, err := r.queue.Enqueue(publishCtx, payload); err != nil {
 			r.logger.Warn("outbox relay: enqueue failed", zap.Int64("id", row.ID), zap.Error(err))
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "enqueue failed")
+			span.End()
 			_ = r.repo.IncrementAttempts(ctx, row.ID, err.Error())
 			if row.Attempts+1 >= row.MaxAttempts {
 				_ = r.repo.MarkFailed(ctx, row.ID, err.Error())
@@ -82,6 +106,7 @@ func (r *Relay) tick(ctx context.Context) {
 			}
 			continue
 		}
+		span.End()
 
 		publishedIDs = append(publishedIDs, row.ID)
 	}
diff --git a/internal/outbox/relay_trace_test.go b/internal/outbox/relay_trace_test.go
new file mode 100644
index 0000000..9fc7f54
--- /dev/null
+++ b/internal/outbox/relay_trace_test.go
@@ -0,0 +1,124 @@
+package outbox
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/rndmcodeguy20/mpiper/internal/models"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/trace"
+	"go.uber.org/zap"
+)
+
+// fakeOutboxRepo is an in-memory OutboxRepository for white-box relay tests:
+// it serves pending once and records the row IDs passed to each mutator.
+type fakeOutboxRepo struct {
+	pending []models.OutboxEvent
+	// row IDs seen by MarkPublished, IncrementAttempts and MarkFailed
+	published, incremented, failed []int64
+}
+
+func (repo *fakeOutboxRepo) InsertTx(context.Context, *sql.Tx, models.OutboxEvent) error {
+	return nil
+}
+func (repo *fakeOutboxRepo) FetchPendingBatch(context.Context, int) ([]models.OutboxEvent, error) {
+	batch := repo.pending
+	repo.pending = nil // single tick
+	return batch, nil
+}
+func (repo *fakeOutboxRepo) MarkPublished(_ context.Context, ids []int64) error {
+	repo.published = append(repo.published, ids...)
+	return nil
+}
+func (repo *fakeOutboxRepo) IncrementAttempts(_ context.Context, id int64, _ string) error {
+	repo.incremented = append(repo.incremented, id)
+	return nil
+}
+func (repo *fakeOutboxRepo) MarkFailed(_ context.Context, id int64, _ string) error {
+	repo.failed = append(repo.failed, id)
+	return nil
+}
+func (repo *fakeOutboxRepo) DeletePublishedBefore(context.Context, time.Time) (int64, error) {
+	return 0, nil
+}
+func (repo *fakeOutboxRepo) CountPending(context.Context) (int64, error) { return 0, nil }
+
+// capturingQueue stands in for the relay's queue and remembers the context
+// and payload of the most recent Enqueue call.
+type capturingQueue struct {
+	gotCtx     context.Context
+	gotPayload map[string]interface{}
+}
+
+func (c *capturingQueue) Enqueue(ctx context.Context, payload map[string]interface{}) (string, error) {
+	c.gotCtx, c.gotPayload = ctx, payload
+	return "1-0", nil
+}
+
+// A row that carries a stored traceparent must be enqueued under a context
+// belonging to that same trace, and the row must end up marked published.
+func TestRelay_ReactivatesStoredTraceContext(t *testing.T) {
+	otel.SetTextMapPropagator(propagation.TraceContext{})
+
+	// Serialize a known, sampled producer span context into a traceparent.
+	wantTrace, _ := trace.TraceIDFromHex("0af7651916cd43dd8448eb211c80319c")
+	producerSpan, _ := trace.SpanIDFromHex("b7ad6b7169203331")
+	producerCtx := trace.ContextWithSpanContext(context.Background(), trace.NewSpanContext(trace.SpanContextConfig{
+		TraceID:    wantTrace,
+		SpanID:     producerSpan,
+		TraceFlags: trace.FlagsSampled,
+	}))
+	headers := propagation.MapCarrier{}
+	otel.GetTextMapPropagator().Inject(producerCtx, headers)
+	traceparent := headers.Get("traceparent")
+	if traceparent == "" {
+		t.Fatal("failed to build traceparent")
+	}
+
+	body, _ := json.Marshal(map[string]interface{}{"asset_id": uuid.New().String()})
+	row := models.OutboxEvent{ID: 7, Event: "asset_uploaded", Payload: body, Traceparent: &traceparent, MaxAttempts: 5}
+	repo := &fakeOutboxRepo{pending: []models.OutboxEvent{row}}
+	sink := &capturingQueue{}
+
+	// One relay pass over the single pending row.
+	NewRelay(repo, sink, zap.NewNop(), nil, time.Second, 100).tick(context.Background())
+
+	if sink.gotCtx == nil {
+		t.Fatal("Enqueue was not called")
+	}
+	enqueued := trace.SpanContextFromContext(sink.gotCtx)
+	if !enqueued.IsValid() {
+		t.Fatal("expected a valid span context passed to Enqueue")
+	}
+	if got := enqueued.TraceID(); got != wantTrace {
+		t.Fatalf("trace id not propagated: want %s got %s", wantTrace, got)
+	}
+	if len(repo.published) != 1 || repo.published[0] != 7 {
+		t.Fatalf("expected row 7 marked published, got %v", repo.published)
+	}
+}
+
+// A row without trace context must still be enqueued and marked published.
+func TestRelay_NoTraceparentStillPublishes(t *testing.T) {
+	otel.SetTextMapPropagator(propagation.TraceContext{})
+
+	// The row's Traceparent is left nil: there is no producer context to restore.
+	body, _ := json.Marshal(map[string]interface{}{"asset_id": uuid.New().String()})
+	row := models.OutboxEvent{ID: 9, Event: "asset_uploaded", Payload: body, MaxAttempts: 5}
+	repo := &fakeOutboxRepo{pending: []models.OutboxEvent{row}}
+	sink := &capturingQueue{}
+
+	NewRelay(repo, sink, zap.NewNop(), nil, time.Second, 100).tick(context.Background())
+
+	if sink.gotCtx == nil {
+		t.Fatal("Enqueue was not called")
+	}
+	if len(repo.published) != 1 || repo.published[0] != 9 {
+		t.Fatalf("expected row 9 marked published, got %v", repo.published)
+	}
+}
diff --git a/internal/queue/queue.go b/internal/queue/queue.go
index 3197f11..2f4ccfc 100644
--- a/internal/queue/queue.go
+++ b/internal/queue/queue.go
@@ -15,6 +15,7 @@ import (
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/propagation"
 	"go.opentelemetry.io/otel/trace"
 )
 
@@ -126,6 +127,18 @@ func (rq *RedisQueue) Enqueue(ctx context.Context, payload map[string]interface{
 		"body": string(body),
 	}
 
+	// Inject the active trace context as top-level message fields so the worker
+	// can extract it and continue the trace across the queue boundary. The
+	// propagator writes traceparent (and tracestate when present); we copy them
+	// out as separate stream fields alongside body, preserving existing keys.
+	carrier := propagation.MapCarrier{}
+	otel.GetTextMapPropagator().Inject(ctx, carrier)
+	for _, k := range []string{"traceparent", "tracestate", "baggage"} {
+		if v := carrier.Get(k); v != "" {
+			streamEntry[k] = v
+		}
+	}
+
 	args := &redis.XAddArgs{
 		Stream: rq.options.QueueName,
 		Values: streamEntry,
diff --git a/internal/repository/api_key_repo.go b/internal/repository/api_key_repo.go
new file mode 100644
index 0000000..704a31b
--- /dev/null
+++ b/internal/repository/api_key_repo.go
@@ -0,0 +1,90 @@
+package repository
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"errors"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/jmoiron/sqlx"
+	"go.uber.org/zap"
+)
+
+// ErrAPIKeyNotFound is returned by GetByHash when no api_keys row has the hash.
+var ErrAPIKeyNotFound = errors.New("api key not found")
+
+// APIKey mirrors one api_keys row. Only KeyHash (SHA-256 hex) and the public
+// Prefix are kept — the plaintext key is never stored.
+type APIKey struct {
+	ID        uuid.UUID  `db:"id"`
+	TenantID  string     `db:"tenant_id"`
+	KeyHash   string     `db:"key_hash"`
+	Prefix    string     `db:"prefix"`
+	ScopesRaw []byte     `db:"scopes"`
+	ExpiresAt *time.Time `db:"expires_at"`
+	RevokedAt *time.Time `db:"revoked_at"`
+	CreatedAt time.Time  `db:"created_at"`
+}
+
+// Scopes decodes the JSONB scopes column. Empty or undecodable data yields nil
+// (fail closed) instead of the partially filled slice Unmarshal leaves behind.
+func (k *APIKey) Scopes() []string {
+	var s []string
+	if len(k.ScopesRaw) == 0 || json.Unmarshal(k.ScopesRaw, &s) != nil {
+		return nil
+	}
+	return s
+}
+
+type APIKeyRepository interface {
+	// Create inserts a new API key row and returns its ID; scopes is stored as JSONB.
+	Create(ctx context.Context, tenantID, keyHash, prefix string, scopes []string, expiresAt *time.Time) (uuid.UUID, error)
+	// GetByHash returns the row whose key_hash equals keyHash, or ErrAPIKeyNotFound.
+	GetByHash(ctx context.Context, keyHash string) (*APIKey, error)
+}
+
+// apiKeyRepo is the sqlx-backed APIKeyRepository implementation.
+type apiKeyRepo struct {
+	db     *sqlx.DB
+	logger *zap.Logger
+}
+
+// NewAPIKeyRepository wraps db and logger in an APIKeyRepository.
+func NewAPIKeyRepository(db *sqlx.DB, logger *zap.Logger) APIKeyRepository { return &apiKeyRepo{db: db, logger: logger} }
+
+// Create inserts one api_keys row and returns the ID the database reports.
+func (r *apiKeyRepo) Create(ctx context.Context, tenantID, keyHash, prefix string, scopes []string, expiresAt *time.Time) (uuid.UUID, error) {
+	// A nil slice would marshal to JSON null; store an empty array instead.
+	if scopes == nil {
+		scopes = []string{}
+	}
+	encoded, err := json.Marshal(scopes)
+	if err != nil {
+		return uuid.Nil, err
+	}
+	const insertKey = `INSERT INTO api_keys (tenant_id, key_hash, prefix, scopes, expires_at)
+		 VALUES ($1, $2, $3, $4::jsonb, $5)
+		 RETURNING id`
+	var newID uuid.UUID
+	row := r.db.QueryRowxContext(ctx, insertKey, tenantID, keyHash, prefix, encoded, expiresAt)
+	if err := row.Scan(&newID); err != nil {
+		return uuid.Nil, err
+	}
+	return newID, nil
+}
+
+// GetByHash loads the api_keys row for keyHash; no row maps to ErrAPIKeyNotFound.
+func (r *apiKeyRepo) GetByHash(ctx context.Context, keyHash string) (*APIKey, error) {
+	const byHash = `SELECT id, tenant_id, key_hash, prefix, scopes, expires_at, revoked_at, created_at
+		 FROM api_keys WHERE key_hash = $1`
+	found := new(APIKey)
+	switch err := r.db.GetContext(ctx, found, byHash, keyHash); {
+	case errors.Is(err, sql.ErrNoRows):
+		return nil, ErrAPIKeyNotFound
+	case err != nil:
+		return nil, err
+	}
+	return found, nil
+}
diff --git a/internal/repository/asset_repo.go b/internal/repository/asset_repo.go
index ed280dc..e638739 100644
--- a/internal/repository/asset_repo.go
+++ b/internal/repository/asset_repo.go
@@ -93,8 +93,9 @@ func ToAssetTypeFromMimeType(mimeType string) AssetType {
 type AssetRepository interface {
 	CreateAsset(ctx context.Context, id uuid.UUID, url string, size int64, fileType AssetType, mimeType string, ownerID string) error
 	CreateAssetTx(ctx context.Context, tx *sql.Tx, id uuid.UUID, url string, size int64, fileType AssetType, mimeType string, ownerID string) error
-	MarkAssetUploadedTx(ctx context.Context, tx *sql.Tx, id uuid.UUID) (bool, error)
+	MarkAssetUploadedTx(ctx context.Context, tx *sql.Tx, id uuid.UUID, tenantID string) (MarkResult, error)
 	InsertProcessAssetJobTx(ctx context.Context, tx *sql.Tx, assetID uuid.UUID) (*int64, error)
+	CountByOwner(ctx context.Context, tenantID string) (int64, error)
 	GetDB() *sqlx.DB
 }
 
@@ -112,6 +113,17 @@ func (r *assetRepo) GetDB() *sqlx.DB {
 	return r.db
 }
 
+// CountByOwner reports the number of rows in assets whose owner_id equals
+// tenantID; it exists for per-tenant usage quota enforcement.
+func (r *assetRepo) CountByOwner(ctx context.Context, tenantID string) (int64, error) {
+	const countSQL = `SELECT COUNT(*) FROM assets WHERE owner_id = $1`
+	var total int64
+	if queryErr := r.db.GetContext(ctx, &total, countSQL, tenantID); queryErr != nil {
+		return 0, appErrors.NewInternalServerError("Could not count tenant assets", queryErr)
+	}
+	return total, nil
+}
+
 func (r *assetRepo) CreateAsset(ctx context.Context, id uuid.UUID, url string, size int64, fileType AssetType, mimeType string, ownerID string) error {
 	start := time.Now()
 	query := `INSERT INTO assets (asset_id, original_url, type, mime_type, status, size_bytes, owner_id) VALUES ($1, $2, $3, $4, $5, $6, $7);`
@@ -192,29 +204,58 @@ func (r *assetRepo) CreateAssetTx(ctx context.Context, tx *sql.Tx, id uuid.UUID,
 	return nil
 }
 
-func (r *assetRepo) MarkAssetUploadedTx(ctx context.Context, tx *sql.Tx, id uuid.UUID) (bool, error) {
-	query := `UPDATE assets SET status = $1, updated_at = NOW() WHERE asset_id = $2 AND status = 'uploading';`
-	res, err := tx.ExecContext(
-		ctx,
-		query,
-		StatusUploaded,
-		id,
-	)
+// MarkResult is the outcome of attempting to mark an asset uploaded, scoped to
+// the calling tenant.
+type MarkResult int
+
+const (
+	// MarkUpdated: the asset transitioned uploading -> uploaded. It is the zero value of MarkResult (iota starts at 0).
+	MarkUpdated MarkResult = iota
+	// MarkAlreadyUploaded: the asset exists and is owned by the tenant, but was
+	// not in the 'uploading' state (idempotent no-op).
+	MarkAlreadyUploaded
+	// MarkNotFound: no asset with that id is owned by the tenant. Callers map
+	// this to 404 so a tenant cannot probe another tenant's asset ids (IDOR).
+	MarkNotFound
+)
+
+func (r *assetRepo) MarkAssetUploadedTx(ctx context.Context, tx *sql.Tx, id uuid.UUID, tenantID string) (MarkResult, error) {
+	// Owner-scoped update: the asset only transitions if it belongs to the
+	// caller. This is the IDOR guard — without owner_id in the WHERE clause a
+	// tenant could complete another tenant's asset by id.
+	query := `UPDATE assets SET status = $1, updated_at = NOW()
+			  WHERE asset_id = $2 AND owner_id = $3 AND status = 'uploading';`
+	res, err := tx.ExecContext(ctx, query, StatusUploaded, id, tenantID)
 	if err != nil {
 		r.logger.Sugar().Errorf("Failed to mark asset as uploaded in transaction: %v", err)
-		return false, appErrors.NewInternalServerError("Could not update row in transaction", err)
+		return MarkNotFound, appErrors.NewInternalServerError("Could not update row in transaction", err)
 	}
 
 	rowsAffected, err := res.RowsAffected()
 	if err != nil {
 		r.logger.Sugar().Errorf("Failed to get rows affected: %v", err)
-		return false, appErrors.NewInternalServerError("Could not get rows affected in transaction", err)
+		return MarkNotFound, appErrors.NewInternalServerError("Could not get rows affected in transaction", err)
 	}
 
-	if rowsAffected == 0 {
-		return false, nil // No rows updated, asset might not be in 'uploading' state
+	if rowsAffected > 0 {
+		return MarkUpdated, nil
+	}
+
+	// Zero rows updated: disambiguate "exists and owned but not uploading"
+	// (idempotent success) from "absent or not owned" (404). The existence
+	// probe is also owner-scoped, so cross-tenant ids are reported as not found.
+	var exists bool
+	if err := tx.QueryRowContext(ctx,
+		`SELECT EXISTS(SELECT 1 FROM assets WHERE asset_id = $1 AND owner_id = $2)`,
+		id, tenantID,
+	).Scan(&exists); err != nil {
+		r.logger.Sugar().Errorf("Failed to check asset ownership: %v", err)
+		return MarkNotFound, appErrors.NewInternalServerError("Could not verify asset ownership", err)
+	}
+	if exists {
+		return MarkAlreadyUploaded, nil
 	}
-	return true, nil // Asset marked as uploaded successfully
+	return MarkNotFound, nil
 }
 
 func (r *assetRepo) InsertProcessAssetJobTx(ctx context.Context, tx *sql.Tx, assetID uuid.UUID) (*int64, error) {
diff --git a/internal/repository/asset_scoping_integration_test.go b/internal/repository/asset_scoping_integration_test.go
new file mode 100644
index 0000000..79562bd
--- /dev/null
+++ b/internal/repository/asset_scoping_integration_test.go
@@ -0,0 +1,125 @@
+//go:build integration
+
+package repository_test
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/jmoiron/sqlx"
+	_ "github.com/lib/pq"
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"github.com/testcontainers/testcontainers-go"
+	tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
+	"github.com/testcontainers/testcontainers-go/wait"
+	"go.uber.org/zap"
+)
+
+func setupAssetsDB(t *testing.T, ctx context.Context) *sqlx.DB { // starts a postgres:16-alpine container and creates a minimal assets table in it
+	t.Helper()
+	pg, err := tcpostgres.Run(ctx, "postgres:16-alpine",
+		tcpostgres.WithDatabase("testdb"),
+		tcpostgres.WithUsername("test"),
+		tcpostgres.WithPassword("test"),
+		testcontainers.WithWaitStrategy(wait.ForListeningPort("5432/tcp").WithStartupTimeout(30*time.Second)),
+	)
+	if err != nil {
+		t.Fatalf("start postgres container: %v", err)
+	}
+	t.Cleanup(func() { _ = pg.Terminate(ctx) }) // registered first, so it runs after the db.Close cleanup below
+
+	dsn, err := pg.ConnectionString(ctx, "sslmode=disable")
+	if err != nil {
+		t.Fatalf("connection string: %v", err)
+	}
+	db, err := sqlx.Connect("postgres", dsn)
+	if err != nil {
+		t.Fatalf("connect: %v", err)
+	}
+	t.Cleanup(func() { _ = db.Close() })
+
+	for _, ddl := range []string{ // NOTE(review): schema is declared inline here, not loaded from migrations — confirm it matches the real assets table
+		`CREATE EXTENSION IF NOT EXISTS "uuid-ossp"`,
+		`CREATE TABLE assets (
+			asset_id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+			original_url TEXT NOT NULL, type TEXT NOT NULL, status TEXT NOT NULL,
+			mime_type TEXT NOT NULL, size_bytes BIGINT NOT NULL, owner_id TEXT NOT NULL,
+			created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW())`,
+	} {
+		if _, err := db.Exec(ddl); err != nil {
+			t.Fatalf("ddl: %v", err)
+		}
+	}
+	return db
+}
+
+func statusOf(t *testing.T, db *sqlx.DB, id uuid.UUID) string { // returns assets.status for id
+	t.Helper()
+	var s string
+	if err := db.Get(&s, `SELECT status FROM assets WHERE asset_id = $1`, id); err != nil {
+		t.Fatalf("read status: %v", err) // any read error, including a missing row, aborts the test
+	}
+	return s
+}
+
+func mark(t *testing.T, db *sqlx.DB, repo repository.AssetRepository, id uuid.UUID, tenant string) repository.MarkResult { // runs MarkAssetUploadedTx in its own committed transaction
+	t.Helper()
+	ctx := context.Background()
+	tx, err := db.BeginTx(ctx, nil) // nil options: the driver's default transaction settings
+	if err != nil {
+		t.Fatalf("begin tx: %v", err)
+	}
+	res, err := repo.MarkAssetUploadedTx(ctx, tx, id, tenant)
+	if err != nil {
+		_ = tx.Rollback() // roll back before Fatalf stops the test
+		t.Fatalf("MarkAssetUploadedTx: %v", err)
+	}
+	if err := tx.Commit(); err != nil { // commit so statusOf observes the change through db
+		t.Fatalf("commit: %v", err)
+	}
+	return res
+}
+
+// TestMarkAssetUploadedTx_TenantScoping is the IDOR regression test: it seeds one 'uploading' asset owned by tenant-a, then checks, in order,
+// the cross-tenant (MarkNotFound), owner (MarkUpdated), repeat-owner (MarkAlreadyUploaded) and unknown-id (MarkNotFound) outcomes.
+func TestMarkAssetUploadedTx_TenantScoping(t *testing.T) {
+	ctx := context.Background()
+	db := setupAssetsDB(t, ctx)
+	repo := repository.NewAssetRepository(db, zap.NewNop(), nil)
+
+	id := uuid.New()
+	if _, err := db.Exec(
+		`INSERT INTO assets (asset_id, original_url, type, status, mime_type, size_bytes, owner_id)
+		 VALUES ($1, 'http://x/raw', 'image', 'uploading', 'image/jpeg', 1, 'tenant-a')`, id,
+	); err != nil {
+		t.Fatalf("seed asset: %v", err)
+	}
+
+	// tenant-b must NOT be able to complete tenant-a's asset.
+	if got := mark(t, db, repo, id, "tenant-b"); got != repository.MarkNotFound {
+		t.Errorf("cross-tenant mark = %v, want MarkNotFound", got)
+	}
+	if s := statusOf(t, db, id); s != "uploading" {
+		t.Errorf("status after cross-tenant attempt = %q, want still 'uploading'", s)
+	}
+
+	// tenant-a completes its own asset.
+	if got := mark(t, db, repo, id, "tenant-a"); got != repository.MarkUpdated {
+		t.Errorf("owner mark = %v, want MarkUpdated", got)
+	}
+	if s := statusOf(t, db, id); s != "uploaded" {
+		t.Errorf("status after owner mark = %q, want 'uploaded'", s)
+	}
+
+	// Re-marking by the owner is an idempotent no-op (already uploaded).
+	if got := mark(t, db, repo, id, "tenant-a"); got != repository.MarkAlreadyUploaded {
+		t.Errorf("repeat owner mark = %v, want MarkAlreadyUploaded", got)
+	}
+
+	// A completely unknown id is not found (for any tenant).
+	if got := mark(t, db, repo, uuid.New(), "tenant-a"); got != repository.MarkNotFound {
+		t.Errorf("unknown id mark = %v, want MarkNotFound", got)
+	}
+}
diff --git a/internal/repository/idempotency_repo.go b/internal/repository/idempotency_repo.go
new file mode 100644
index 0000000..ff299b7
--- /dev/null
+++ b/internal/repository/idempotency_repo.go
@@ -0,0 +1,149 @@
+package repository
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"time"
+
+	"github.com/jmoiron/sqlx"
+	"go.uber.org/zap"
+)
+
+// AcquireOutcome is the result of trying to claim an idempotency key.
+type AcquireOutcome int
+
+const (
+	// AcquireAcquired: the caller won the lock and must execute the handler. It is the zero value of AcquireOutcome (iota starts at 0).
+	AcquireAcquired AcquireOutcome = iota
+	// AcquireReplay: a completed response exists for this key — replay it.
+	AcquireReplay
+	// AcquireMismatch: the key was used before with a different request body.
+	AcquireMismatch
+	// AcquireInFlight: a concurrent request holds the key but hasn't finished.
+	AcquireInFlight
+)
+
+// IdempotencyRecord carries a stored response for replay.
+type IdempotencyRecord struct {
+	ResponseStatus int    // value read from the response_status column
+	ResponseBody   []byte // value read from the response_body column
+}
+
+type IdempotencyRepository interface {
+	// Acquire attempts to claim (tenant, key) for fingerprint. It is
+	// concurrency-safe: exactly one concurrent caller receives AcquireAcquired. The record is non-nil only for AcquireReplay.
+	Acquire(ctx context.Context, tenant, key, fingerprint string, ttl time.Duration) (AcquireOutcome, *IdempotencyRecord, error)
+	// Complete stores the final response and marks the key done. NOTE(review): the implementation in this file casts body to jsonb — confirm callers only pass valid JSON.
+	Complete(ctx context.Context, tenant, key string, status int, body []byte) error
+	// Release deletes a still-pending key so the client can retry (used when
+	// the handler produced a non-cacheable error, e.g. 5xx).
+	Release(ctx context.Context, tenant, key string) error
+	// DeleteExpired purges keys past their TTL. Returns rows deleted.
+	DeleteExpired(ctx context.Context) (int64, error)
+}
+
+type idempotencyRepo struct { // sqlx-backed implementation of IdempotencyRepository
+	db     *sqlx.DB    // every statement in this type runs directly on db, outside any explicit transaction
+	logger *zap.Logger // stored by the constructor; the methods in this file do not log
+}
+
+func NewIdempotencyRepository(db *sqlx.DB, logger *zap.Logger) IdempotencyRepository { // wires the dependencies; performs no I/O
+	return &idempotencyRepo{db: db, logger: logger}
+}
+
+func (r *idempotencyRepo) Acquire(ctx context.Context, tenant, key, fingerprint string, ttl time.Duration) (AcquireOutcome, *IdempotencyRecord, error) {
+	expiresAt := time.Now().Add(ttl) // application clock; the expiry checks below compare against the database's NOW()
+
+	// 1. Fresh claim. ON CONFLICT DO NOTHING is atomic: only one concurrent
+	//    INSERT for the same (tenant, key) affects a row.
+	res, err := r.db.ExecContext(ctx,
+		`INSERT INTO idempotency_keys (tenant_id, key, request_fingerprint, status, expires_at)
+		 VALUES ($1, $2, $3, 'pending', $4)
+		 ON CONFLICT (tenant_id, key) DO NOTHING`,
+		tenant, key, fingerprint, expiresAt,
+	)
+	if err != nil {
+		return AcquireInFlight, nil, err // every error path pairs err with AcquireInFlight, so check err before the outcome
+	}
+	if n, _ := res.RowsAffected(); n == 1 { // the RowsAffected error is discarded; n == 1 means this call inserted the row
+		return AcquireAcquired, nil, nil
+	}
+
+	// 2. A row already exists. If it is expired, atomically take it over
+	//    (resetting it to pending). Only one concurrent UPDATE matches, since
+	//    after it runs expires_at is in the future.
+	res, err = r.db.ExecContext(ctx,
+		`UPDATE idempotency_keys
+		 SET request_fingerprint = $3, status = 'pending',
+		     response_status = NULL, response_body = NULL, asset_id = NULL,
+		     created_at = NOW(), expires_at = $4
+		 WHERE tenant_id = $1 AND key = $2 AND expires_at <= NOW()`,
+		tenant, key, fingerprint, expiresAt,
+	)
+	if err != nil {
+		return AcquireInFlight, nil, err
+	}
+	if n, _ := res.RowsAffected(); n == 1 { // the takeover applies whatever the expired row's status or fingerprint was
+		return AcquireAcquired, nil, nil
+	}
+
+	// 3. A live row exists — decide replay / mismatch / in-flight.
+	var (
+		storedFingerprint string
+		status            string
+		respStatus        sql.NullInt64
+		respBody          []byte
+	)
+	err = r.db.QueryRowxContext(ctx,
+		`SELECT request_fingerprint, status, response_status, response_body
+		 FROM idempotency_keys WHERE tenant_id = $1 AND key = $2`,
+		tenant, key,
+	).Scan(&storedFingerprint, &status, &respStatus, &respBody)
+	if errors.Is(err, sql.ErrNoRows) {
+		// Raced with an expiry/sweep; treat as in-flight so the client retries.
+		return AcquireInFlight, nil, nil
+	}
+	if err != nil {
+		return AcquireInFlight, nil, err
+	}
+
+	if storedFingerprint != fingerprint { // the fingerprint check comes first, so a mismatch wins over both replay and in-flight
+		return AcquireMismatch, nil, nil
+	}
+	if status == "done" {
+		return AcquireReplay, &IdempotencyRecord{
+			ResponseStatus: int(respStatus.Int64), // Int64 is 0 when response_status is NULL; Valid is not consulted
+			ResponseBody:   respBody,
+		}, nil
+	}
+	return AcquireInFlight, nil, nil
+}
+
+func (r *idempotencyRepo) Complete(ctx context.Context, tenant, key string, status int, body []byte) error { // stores the response for (tenant, key), whatever the row's current status
+	_, err := r.db.ExecContext(ctx, // the sql.Result is discarded, so an UPDATE that matches zero rows still returns nil
+		`UPDATE idempotency_keys
+		 SET status = 'done', response_status = $3, response_body = $4::jsonb
+		 WHERE tenant_id = $1 AND key = $2`,
+		tenant, key, status, body, // NOTE(review): body is cast to jsonb above — presumably a non-JSON or empty body makes this fail; confirm callers
+	)
+	return err
+}
+
+func (r *idempotencyRepo) Release(ctx context.Context, tenant, key string) error {
+	_, err := r.db.ExecContext(ctx,
+		`DELETE FROM idempotency_keys WHERE tenant_id = $1 AND key = $2 AND status = 'pending'`,
+		tenant, key,
+	)
+	return err
+}
+
+func (r *idempotencyRepo) DeleteExpired(ctx context.Context) (int64, error) {
+	res, err := r.db.ExecContext(ctx,
+		`DELETE FROM idempotency_keys WHERE expires_at <= NOW()`)
+	if err != nil {
+		return 0, err
+	}
+	// Propagate a RowsAffected failure instead of reporting it as zero rows.
+	return res.RowsAffected()
+}
diff --git a/internal/repository/idempotency_scoping_integration_test.go b/internal/repository/idempotency_scoping_integration_test.go
new file mode 100644
index 0000000..f58c08c
--- /dev/null
+++ b/internal/repository/idempotency_scoping_integration_test.go
@@ -0,0 +1,148 @@
+//go:build integration
+
+package repository_test
+
+import (
+	"context"
+	"encoding/json"
+	"reflect"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/jmoiron/sqlx"
+	_ "github.com/lib/pq"
+	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"github.com/testcontainers/testcontainers-go"
+	tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
+	"github.com/testcontainers/testcontainers-go/wait"
+	"go.uber.org/zap"
+)
+
+func setupIdempotencyDB(t *testing.T, ctx context.Context) *sqlx.DB { // starts a postgres:16-alpine container and creates the idempotency_keys table in it
+	t.Helper()
+	pg, err := tcpostgres.Run(ctx, "postgres:16-alpine",
+		tcpostgres.WithDatabase("testdb"),
+		tcpostgres.WithUsername("test"),
+		tcpostgres.WithPassword("test"),
+		testcontainers.WithWaitStrategy(wait.ForListeningPort("5432/tcp").WithStartupTimeout(30*time.Second)),
+	)
+	if err != nil {
+		t.Fatalf("start postgres: %v", err)
+	}
+	t.Cleanup(func() { _ = pg.Terminate(ctx) }) // registered first, so it runs after the db.Close cleanup below
+
+	dsn, err := pg.ConnectionString(ctx, "sslmode=disable")
+	if err != nil {
+		t.Fatalf("dsn: %v", err)
+	}
+	db, err := sqlx.Connect("postgres", dsn)
+	if err != nil {
+		t.Fatalf("connect: %v", err)
+	}
+	t.Cleanup(func() { _ = db.Close() })
+
+	if _, err := db.Exec(`CREATE TABLE idempotency_keys (
+		tenant_id TEXT NOT NULL, key TEXT NOT NULL, request_fingerprint TEXT NOT NULL,
+		status TEXT NOT NULL DEFAULT 'pending', response_status INT, response_body JSONB,
+		asset_id UUID, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), expires_at TIMESTAMPTZ NOT NULL,
+		PRIMARY KEY (tenant_id, key))`); err != nil { // NOTE(review): schema is declared inline, not loaded from migrations — confirm it matches the real table
+		t.Fatalf("ddl: %v", err)
+	}
+	return db
+}
+
+// TestIdempotency_ConcurrentAcquire_SingleWinner is the no-race-dupes test:
+// n goroutines call Acquire with the same (tenant, key, fingerprint) and must yield
+// exactly one AcquireAcquired; the rest see AcquireInFlight.
+func TestIdempotency_ConcurrentAcquire_SingleWinner(t *testing.T) {
+	ctx := context.Background()
+	db := setupIdempotencyDB(t, ctx)
+	repo := repository.NewIdempotencyRepository(db, zap.NewNop())
+
+	const n = 20
+	var (
+		wg        sync.WaitGroup
+		mu        sync.Mutex // guards the three counters below
+		acquired  int
+		inflight  int
+		otherSeen int
+	)
+	wg.Add(n)
+	for i := 0; i < n; i++ {
+		go func() {
+			defer wg.Done()
+			outcome, _, err := repo.Acquire(ctx, "tenant-a", "key-1", "fp-1", time.Hour)
+			if err != nil {
+				t.Errorf("Acquire: %v", err) // a failed call is reported and counted in none of the buckets
+				return
+			}
+			mu.Lock()
+			switch outcome {
+			case repository.AcquireAcquired:
+				acquired++
+			case repository.AcquireInFlight:
+				inflight++
+			default:
+				otherSeen++
+			}
+			mu.Unlock()
+		}()
+	}
+	wg.Wait()
+
+	if acquired != 1 {
+		t.Errorf("acquired = %d, want exactly 1", acquired)
+	}
+	if inflight != n-1 {
+		t.Errorf("inflight = %d, want %d", inflight, n-1)
+	}
+	if otherSeen != 0 {
+		t.Errorf("unexpected outcomes = %d, want 0", otherSeen)
+	}
+}
+
+// TestIdempotency_ReplayAndMismatch covers the done-replay and different-fingerprint paths. Every Acquire error is
+// checked: Acquire pairs an error with AcquireInFlight, so an unchecked failure would satisfy the InFlight assertion.
+func TestIdempotency_ReplayAndMismatch(t *testing.T) {
+	ctx := context.Background()
+	db := setupIdempotencyDB(t, ctx)
+	repo := repository.NewIdempotencyRepository(db, zap.NewNop())
+
+	// First acquire wins.
+	if o, _, err := repo.Acquire(ctx, "t", "k", "fp", time.Hour); err != nil || o != repository.AcquireAcquired {
+		t.Fatalf("first acquire = %v (err=%v), want Acquired", o, err)
+	}
+	// Same key while pending -> in flight.
+	if o, _, err := repo.Acquire(ctx, "t", "k", "fp", time.Hour); err != nil || o != repository.AcquireInFlight {
+		t.Errorf("pending re-acquire = %v (err=%v), want InFlight", o, err)
+	}
+	// Complete, then a matching replay returns the stored response.
+	if err := repo.Complete(ctx, "t", "k", 201, []byte(`{"ok":true}`)); err != nil {
+		t.Fatalf("Complete: %v", err)
+	}
+	o, rec, err := repo.Acquire(ctx, "t", "k", "fp", time.Hour)
+	if err != nil || o != repository.AcquireReplay {
+		t.Fatalf("post-complete acquire = %v (err=%v), want Replay", o, err)
+	}
+	// Postgres JSONB normalizes whitespace on storage (e.g. `{"ok":true}` is
+	// re-serialized as `{"ok": true}`), so compare semantic JSON rather than
+	// raw bytes — the replay contract is content-equality, not byte equality.
+	if rec == nil || rec.ResponseStatus != 201 {
+		t.Fatalf("replay record = %+v", rec)
+	}
+	var got, want any
+	if err := json.Unmarshal(rec.ResponseBody, &got); err != nil {
+		t.Fatalf("unmarshal stored body: %v (raw=%q)", err, rec.ResponseBody)
+	}
+	if err := json.Unmarshal([]byte(`{"ok":true}`), &want); err != nil {
+		t.Fatalf("unmarshal expected: %v", err)
+	}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("replay body = %s, want {\"ok\":true}", rec.ResponseBody)
+	}
+	// Same key, different fingerprint -> mismatch.
+	if o, _, err := repo.Acquire(ctx, "t", "k", "different-fp", time.Hour); err != nil || o != repository.AcquireMismatch {
+		t.Errorf("different-fingerprint acquire = %v (err=%v), want Mismatch", o, err)
+	}
+}
diff --git a/internal/repository/outbox_repo.go b/internal/repository/outbox_repo.go
index 2ed7d3b..2676bb2 100644
--- a/internal/repository/outbox_repo.go
+++ b/internal/repository/outbox_repo.go
@@ -32,8 +32,8 @@ func NewOutboxRepository(db *sqlx.DB, logger *zap.Logger) OutboxRepository {
 
 func (r *outboxRepo) InsertTx(ctx context.Context, tx *sql.Tx, event models.OutboxEvent) error {
 	_, err := tx.ExecContext(ctx,
-		`INSERT INTO event_outbox (aggregate_id, job_id, event, payload, max_attempts) VALUES ($1, $2, $3, $4, $5)`,
-		event.AggregateID, event.JobID, event.Event, event.Payload, event.MaxAttempts,
+		`INSERT INTO event_outbox (aggregate_id, job_id, event, payload, traceparent, max_attempts) VALUES ($1, $2, $3, $4, $5, $6)`,
+		event.AggregateID, event.JobID, event.Event, event.Payload, event.Traceparent, event.MaxAttempts,
 	)
 	return err
 }
@@ -41,7 +41,7 @@ func (r *outboxRepo) InsertTx(ctx context.Context, tx *sql.Tx, event models.Outb
 func (r *outboxRepo) FetchPendingBatch(ctx context.Context, limit int) ([]models.OutboxEvent, error) {
 	var rows []models.OutboxEvent
 	err := r.db.SelectContext(ctx, &rows,
-		`SELECT id, aggregate_id, job_id, event, payload, status, attempts, max_attempts, last_error, created_at, published_at
+		`SELECT id, aggregate_id, job_id, event, payload, traceparent, status, attempts, max_attempts, last_error, created_at, published_at
 		 FROM event_outbox WHERE status = 'pending' ORDER BY id LIMIT $1 FOR UPDATE SKIP LOCKED`, limit)
 	return rows, err
 }
diff --git a/internal/router/router.go b/internal/router/router.go
index 7d44fea..ef66fb0 100644
--- a/internal/router/router.go
+++ b/internal/router/router.go
@@ -3,8 +3,6 @@ package router
 import (
 	"math/rand"
 	"net/http"
-	"strings"
-	"sync"
 	"time"
 
 	"github.com/go-chi/chi/v5"
@@ -13,13 +11,12 @@ import (
 	"github.com/jmoiron/sqlx"
 	"github.com/rndmcodeguy20/mpiper/internal/config"
 	"github.com/rndmcodeguy20/mpiper/internal/handler"
-	appMiddleware "github.com/rndmcodeguy20/mpiper/internal/middleware"
 	"github.com/rndmcodeguy20/mpiper/internal/metrics"
+	appMiddleware "github.com/rndmcodeguy20/mpiper/internal/middleware"
 	"github.com/rndmcodeguy20/mpiper/internal/repository"
 	"github.com/rndmcodeguy20/mpiper/internal/service"
 	applogger "github.com/rndmcodeguy20/mpiper/pkg/logger"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils"
-	"golang.org/x/time/rate"
 )
 
 const (
@@ -27,56 +24,8 @@ const (
 	MiddlewareTimeout = 30 * time.Second
 )
 
-// presignRateLimiter returns a per-IP rate-limit middleware.
-// Each IP is allowed 10 requests/s with a burst of 20.
-func presignRateLimiter() func(http.Handler) http.Handler {
-	type entry struct {
-		lim      *rate.Limiter
-		lastSeen time.Time
-	}
-	var (
-		mu      sync.Mutex
-		clients = make(map[string]*entry)
-	)
-	// Evict IPs not seen in the last 5 minutes to prevent unbounded growth.
-	go func() {
-		for range time.Tick(time.Minute) {
-			mu.Lock()
-			for ip, e := range clients {
-				if time.Since(e.lastSeen) > 5*time.Minute {
-					delete(clients, ip)
-				}
-			}
-			mu.Unlock()
-		}
-	}()
-
-	getLimiter := func(ip string) *rate.Limiter {
-		mu.Lock()
-		defer mu.Unlock()
-		e, ok := clients[ip]
-		if !ok {
-			e = &entry{lim: rate.NewLimiter(rate.Limit(10), 20)}
-			clients[ip] = e
-		}
-		e.lastSeen = time.Now()
-		return e.lim
-	}
-
-	return func(next http.Handler) http.Handler {
-		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			ip := r.RemoteAddr
-			if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
-				ip = strings.SplitN(xff, ",", 2)[0]
-			}
-			if !getLimiter(strings.TrimSpace(ip)).Allow() {
-				http.Error(w, `{"status":"error","message":"rate limit exceeded"}`, http.StatusTooManyRequests)
-				return
-			}
-			next.ServeHTTP(w, r)
-		})
-	}
-}
+// presignRateLimiter removed: per-tenant rate limiting now lives in
+// middleware.TenantRateLimitMiddleware (keyed by tenant id, not IP).
 
 func NewRouter(cfg config.EnvConfig, db *sqlx.DB, m *metrics.Metrics) *chi.Mux {
 	r := chi.NewRouter()
@@ -105,9 +54,9 @@ func NewRouter(cfg config.EnvConfig, db *sqlx.DB, m *metrics.Metrics) *chi.Mux {
 		AllowCredentials: true,
 		MaxAge:           300,
 	}))
+	r.Use(appMiddleware.TracingMiddleware)
 	r.Use(appMiddleware.LoggerMiddleware(logger))
 	r.Use(middleware.Timeout(MiddlewareTimeout))
-	r.Use(appMiddleware.TracingMiddleware)
 	r.Use(appMiddleware.MetricsMiddleware(m))
 	r.Use(middleware.Compress(5))
 	r.Use(appMiddleware.SlowRequestMiddleware(logger, 2*time.Second))
@@ -116,6 +65,9 @@ func NewRouter(cfg config.EnvConfig, db *sqlx.DB, m *metrics.Metrics) *chi.Mux {
 	outboxRepo := repository.NewOutboxRepository(db, logger)
 	assetSvc := service.NewAssetService(assetRepo, outboxRepo, logger, m)
 	assetHandler := handler.NewAssetHandler(assetSvc, logger, m)
+	apiKeyRepo := repository.NewAPIKeyRepository(db, logger)
+	idempotencyRepo := repository.NewIdempotencyRepository(db, logger)
+	idempotencyTTL := config.MustGet().IdempotencyTTL
 
 	// Routes
 	r.Get("/", func(w http.ResponseWriter, r *http.Request) {
@@ -148,17 +100,22 @@ func NewRouter(cfg config.EnvConfig, db *sqlx.DB, m *metrics.Metrics) *chi.Mux {
 		})
 
 		r.Route("/storage", func(r chi.Router) {
-			r.Use(appMiddleware.AuthMiddleware(logger))
-			r.With(presignRateLimiter()).Post("/presign", assetHandler.CreateAsset)
+			quotaCfg := config.MustGet().Quota
+			r.Use(appMiddleware.AuthMiddleware(logger, apiKeyRepo))
+			r.Use(appMiddleware.IdempotencyMiddleware(logger, idempotencyRepo, idempotencyTTL))
+			r.Use(appMiddleware.TenantRateLimitMiddleware(logger, m, quotaCfg.RateLimitRPS, quotaCfg.RateLimitBurst))
+			r.Use(appMiddleware.TenantQuotaMiddleware(logger, m, assetRepo, quotaCfg.AssetQuota))
+			r.Post("/presign", assetHandler.CreateAsset)
 		})
 
 		r.Route("/assets", func(r chi.Router) {
-			r.Use(appMiddleware.AuthMiddleware(logger))
+			r.Use(appMiddleware.AuthMiddleware(logger, apiKeyRepo))
+			r.Use(appMiddleware.IdempotencyMiddleware(logger, idempotencyRepo, idempotencyTTL))
 			r.Get("/{assetID}/complete", assetHandler.MarkAssetUploaded)
 		})
 
 		r.Route("/webhooks", func(r chi.Router) {
-			r.Use(appMiddleware.AuthMiddleware(logger))
+			r.Use(appMiddleware.AuthMiddleware(logger, apiKeyRepo))
 			webhookRepo := repository.NewWebhookRepository(db, logger)
 			webhookSvc := service.NewWebhookService(webhookRepo, logger)
 			webhookHandler := handler.NewWebhookHandler(webhookSvc, logger)
diff --git a/internal/service/asset.go b/internal/service/asset.go
index 86bb684..d2f4691 100644
--- a/internal/service/asset.go
+++ b/internal/service/asset.go
@@ -14,11 +14,14 @@ import (
 	"github.com/rndmcodeguy20/mpiper/internal/middleware"
 	"github.com/rndmcodeguy20/mpiper/internal/models"
 	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	apperrors "github.com/rndmcodeguy20/mpiper/pkg/errors"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils/storagex"
+	"github.com/rndmcodeguy20/mpiper/pkg/utils/tenant"
 	"go.opentelemetry.io/otel"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/propagation"
 	"go.uber.org/zap"
 )
 
@@ -86,7 +89,12 @@ func (s *assetService) CreateAsset(ctx context.Context, request models.UploadAss
 		attribute.Int64("content_length", request.Size),
 	)
 
-	objectKey := fmt.Sprintf("media/raw/%s", assetID)
+	objectKey, err := rawObjectKey(ctx, assetID)
+	if err != nil {
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "Failed to derive tenant-scoped object key")
+		return nil, err
+	}
 
 	spanStorageCtx, spanStorage := tracer.Start(ctx, "StorageClient.GeneratePresignedURL")
 	spanStorage.SetAttributes(attribute.String("object_key", objectKey))
@@ -124,7 +132,7 @@ func (s *assetService) CreateAsset(ctx context.Context, request models.UploadAss
 
 	spanStorageCtx, spanStorage = tracer.Start(ctx, "AssetRepo.CreateAsset")
 	spanStorage.SetAttributes(attribute.String("asset_id", assetID.String()))
-	ownerID, _ := middleware.GetUserID(ctx)
+	ownerID, _ := middleware.GetTenant(ctx)
 	err = s.assetRepo.CreateAsset(spanStorageCtx, assetID, publicUrl, request.Size, repository.ToAssetTypeFromMimeType(request.ContentType), request.ContentType, ownerID)
 	spanStorage.End()
 
@@ -175,13 +183,24 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID)
 
 	span.SetAttributes(attribute.String("asset_id", assetID.String()))
 
-	// check if asset exists
-	objectKey := fmt.Sprintf("media/raw/%s", assetID)
-	span.SetAttributes(attribute.String("object_key", objectKey))
+	// rawObjectKey validates the tenant and builds the tenant-scoped key.
+	// It returns UnauthorizedError when the tenant is missing from context
+	// and BadRequestError when the slug is malformed.
+	objectKey, err := rawObjectKey(ctx, assetID)
+	if err != nil {
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "Failed to derive tenant-scoped object key")
+		return err
+	}
+
+	// rawObjectKey already validated the tenant, so re-extract it only for
+	// span attributes and the repo update.
+	tenantID, _ := middleware.GetTenant(ctx)
+	span.SetAttributes(attribute.String("tenant", tenantID), attribute.String("object_key", objectKey))
 
 	ctxStorage, spanStorage := tracer.Start(ctx, "StorageClient.GetObjectAttrs")
 	spanStorage.SetAttributes(attribute.String("object_key", objectKey))
-	_, err := s.storageClient.GetObjectAttrs(ctxStorage, s.bucket, objectKey)
+	_, err = s.storageClient.GetObjectAttrs(ctxStorage, s.bucket, objectKey)
 	spanStorage.End()
 
 	if err != nil {
@@ -215,7 +234,7 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID)
 
 	ctxUpdate, spanUpdate := tracer.Start(ctxTx, "AssetRepo.MarkAssetUploadedTx")
 	spanUpdate.SetAttributes(attribute.String("asset_id", assetID.String()))
-	changed, err := s.assetRepo.MarkAssetUploadedTx(ctxUpdate, tx, assetID)
+	result, err := s.assetRepo.MarkAssetUploadedTx(ctxUpdate, tx, assetID, tenantID)
 	spanUpdate.End()
 
 	if err != nil {
@@ -226,7 +245,14 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID)
 		return err
 	}
 
-	if !changed {
+	switch result {
+	case repository.MarkNotFound:
+		// Absent or owned by another tenant — return 404 (no existence leak).
+		spanUpdate.AddEvent("Asset not found for tenant")
+		span.SetStatus(codes.Error, "Asset not found for tenant")
+		s.logger.Sugar().Infof("Asset %s not found for tenant %s", assetID, tenantID)
+		return apperrors.NewNotFoundError("Asset not found", nil)
+	case repository.MarkAlreadyUploaded:
 		spanUpdate.AddEvent("Asset already uploaded")
 		span.AddEvent("Asset already in uploaded state")
 		s.logger.Sugar().Infof("Asset %s already marked as uploaded", assetID)
@@ -258,11 +284,23 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID)
 		"event":     "asset_uploaded",
 		"timestamp": time.Now().UTC().Format(time.RFC3339),
 	})
+
+	// Capture the active trace context so it survives the outbox store-and-forward
+	// hop. The relay (running on a background ticker context) re-activates this
+	// before publishing to Redis, keeping the whole pipeline in one trace.
+	carrier := propagation.MapCarrier{}
+	otel.GetTextMapPropagator().Inject(ctxOutbox, carrier)
+	var traceparent *string
+	if tp := carrier.Get("traceparent"); tp != "" {
+		traceparent = &tp
+	}
+
 	err = s.outboxRepo.InsertTx(ctxOutbox, tx, models.OutboxEvent{
 		AggregateID: assetID,
 		JobID:       jobID,
 		Event:       "asset_uploaded",
 		Payload:     payload,
+		Traceparent: traceparent,
 	})
 	spanOutbox.End()
 
@@ -306,3 +344,17 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID)
 	span.SetStatus(codes.Ok, "Asset marked as uploaded and outbox event created")
 	return nil
 }
+
+// rawObjectKey derives the storage key "media/<tenant>/raw/<assetID>" from the
+// tenant carried in ctx. A missing or empty tenant yields an unauthorized
+// error; a tenant rejected by tenant.IsValidSlug yields a bad-request error.
+func rawObjectKey(ctx context.Context, assetID uuid.UUID) (string, error) {
+	slug, present := middleware.GetTenant(ctx)
+	switch {
+	case !present || slug == "":
+		return "", apperrors.NewUnauthorizedError("Tenant not found", nil)
+	case !tenant.IsValidSlug(slug):
+		return "", apperrors.NewBadRequestError("Invalid tenant identifier", nil)
+	}
+	return fmt.Sprintf("media/%s/raw/%s", slug, assetID), nil
+}
diff --git a/internal/service/webhook.go b/internal/service/webhook.go
index 97c775d..31c18e7 100644
--- a/internal/service/webhook.go
+++ b/internal/service/webhook.go
@@ -2,6 +2,7 @@ package service
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"net/url"
 
@@ -9,6 +10,7 @@ import (
 	"github.com/rndmcodeguy20/mpiper/internal/config"
 	"github.com/rndmcodeguy20/mpiper/internal/middleware"
 	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	apperrors "github.com/rndmcodeguy20/mpiper/pkg/errors"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils"
 	"go.uber.org/zap"
 )
@@ -37,26 +39,26 @@ func NewWebhookService(repo repository.WebhookRepository, logger *zap.Logger) We
 
 func (s *webhookService) Create(ctx context.Context, reqURL, secret string, events []string) (*repository.WebhookRegistration, error) {
 	if _, err := url.ParseRequestURI(reqURL); err != nil {
-		return nil, fmt.Errorf("invalid url: %w", err)
+		return nil, apperrors.NewBadRequestError("invalid url", err)
 	}
 	if secret == "" {
-		return nil, fmt.Errorf("secret is required")
+		return nil, apperrors.NewBadRequestError("secret is required", nil)
 	}
 	if len(events) == 0 {
-		return nil, fmt.Errorf("at least one event is required")
+		return nil, apperrors.NewBadRequestError("at least one event is required", nil)
 	}
 	for _, e := range events {
 		if !validEvents[e] {
-			return nil, fmt.Errorf("invalid event: %s", e)
+			return nil, apperrors.NewBadRequestError(fmt.Sprintf("invalid event: %s", e), nil)
 		}
 	}
 
-	userID, ok := middleware.GetUserID(ctx)
+	userID, ok := middleware.GetTenant(ctx)
 	if !ok || userID == "" {
-		return nil, fmt.Errorf("user_id not found in context")
+		return nil, apperrors.NewUnauthorizedError("Tenant not found", nil)
 	}
 
-	encryptedSecret, err := utils.GenerateToken(secret, config.MustGet().EncryptionKey)
+	encryptedSecret, err := utils.GenerateToken(secret, config.MustGet().WebhookEncryptionKey)
 	if err != nil {
 		return nil, fmt.Errorf("failed to encrypt secret: %w", err)
 	}
@@ -77,17 +79,23 @@ func (s *webhookService) Create(ctx context.Context, reqURL, secret string, even
 }
 
 func (s *webhookService) List(ctx context.Context) ([]repository.WebhookRegistration, error) {
-	userID, ok := middleware.GetUserID(ctx)
+	userID, ok := middleware.GetTenant(ctx)
 	if !ok || userID == "" {
-		return nil, fmt.Errorf("user_id not found in context")
+		return nil, apperrors.NewUnauthorizedError("Tenant not found", nil)
 	}
 	return s.repo.ListByUser(ctx, userID)
 }
 
 func (s *webhookService) Delete(ctx context.Context, id uuid.UUID) error {
-	userID, ok := middleware.GetUserID(ctx)
+	userID, ok := middleware.GetTenant(ctx)
 	if !ok || userID == "" {
-		return fmt.Errorf("user_id not found in context")
+		return apperrors.NewUnauthorizedError("Tenant not found", nil)
 	}
-	return s.repo.Delete(ctx, id, userID)
+	if err := s.repo.Delete(ctx, id, userID); err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return apperrors.NewNotFoundError("Webhook not found", nil)
+		}
+		return err
+	}
+	return nil
 }
diff --git a/internal/service/webhook_test.go b/internal/service/webhook_test.go
index dc4d9fc..af6e272 100644
--- a/internal/service/webhook_test.go
+++ b/internal/service/webhook_test.go
@@ -8,6 +8,7 @@ import (
 	"github.com/rndmcodeguy20/mpiper/internal/config"
 	"github.com/rndmcodeguy20/mpiper/internal/middleware"
 	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"github.com/rndmcodeguy20/mpiper/pkg/utils"
 	"go.uber.org/zap"
 )
 
@@ -26,13 +27,14 @@ func (m *mockWebhookRepo) ListByUser(_ context.Context, _ string) ([]repository.
 func (m *mockWebhookRepo) Delete(_ context.Context, _ uuid.UUID, _ string) error { return nil }
 
 func ctxWithUser(userID string) context.Context {
-	return context.WithValue(context.Background(), middleware.UserIDKey(), userID)
+	return middleware.WithTenant(context.Background(), userID)
 }
 
 func init() {
-	// Initialize config singleton for tests (32-byte encryption key).
+	// Initialize config singleton for tests (32-byte encryption keys).
 	config.Init(config.EnvConfig{
-		EncryptionKey: "01234567890123456789012345678901",
+		EncryptionKey:        "01234567890123456789012345678901",
+		WebhookEncryptionKey: "98765432109876543210987654321098",
 	})
 }
 
@@ -113,3 +115,33 @@ func TestWebhookService_Create_NoUserInContext(t *testing.T) {
 		t.Fatal("expected error for missing user in context")
 	}
 }
+
+// TestWebhookService_Create_UsesWebhookKey verifies the stored secret is
+// encrypted with WEBHOOK_ENCRYPTION_KEY (the split key) and NOT the auth
+// ENCRYPTION_KEY — decrypting with the webhook key recovers the plaintext while
+// the auth key fails.
+func TestWebhookService_Create_UsesWebhookKey(t *testing.T) {
+	repo := &mockWebhookRepo{}
+	svc := NewWebhookService(repo, zap.NewNop())
+
+	const plaintext = "my-signing-secret"
+	ctx := ctxWithUser("tenant-1")
+	if _, err := svc.Create(ctx, "https://example.com/hook", plaintext, []string{"job.done"}); err != nil {
+		t.Fatalf("Create: %v", err)
+	}
+	stored := repo.created[0].Secret
+
+	cfg := config.MustGet()
+	got, err := utils.DecryptToken(stored, cfg.WebhookEncryptionKey)
+	if err != nil {
+		t.Fatalf("decrypt with webhook key failed: %v", err)
+	}
+	if got != plaintext {
+		t.Errorf("decrypted = %q, want %q", got, plaintext)
+	}
+
+	// The auth key must NOT decrypt the webhook secret (keys are distinct).
+	if _, err := utils.DecryptToken(stored, cfg.EncryptionKey); err == nil {
+		t.Error("auth ENCRYPTION_KEY should not decrypt a webhook secret — keys are not separated")
+	}
+}
diff --git a/internal/webhook/dispatcher.go b/internal/webhook/dispatcher.go
index 0fb08c9..92405dc 100644
--- a/internal/webhook/dispatcher.go
+++ b/internal/webhook/dispatcher.go
@@ -15,8 +15,12 @@ import (
 
 	"github.com/google/uuid"
 	"github.com/jmoiron/sqlx"
+	"github.com/rndmcodeguy20/mpiper/internal/metrics"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils"
+	"go.opentelemetry.io/otel/attribute"
+	otelmetric "go.opentelemetry.io/otel/metric"
 	"go.uber.org/zap"
+	"golang.org/x/sync/errgroup"
 )
 
 type DispatcherConfig struct {
@@ -26,6 +30,8 @@ type DispatcherConfig struct {
 	MaxAttempts   int
 	EncryptionKey string
 	Retention     time.Duration
+	// Concurrency bounds the number of webhook deliveries in flight per tick.
+	Concurrency int
 }
 
 type Dispatcher struct {
@@ -33,14 +39,29 @@ type Dispatcher struct {
 	logger *zap.Logger
 	client *http.Client
 	cfg    DispatcherConfig
+	m      *metrics.Metrics
 }
 
-func NewDispatcher(db *sqlx.DB, logger *zap.Logger, cfg DispatcherConfig) *Dispatcher {
+func NewDispatcher(db *sqlx.DB, logger *zap.Logger, cfg DispatcherConfig, m *metrics.Metrics) *Dispatcher {
+	if cfg.Concurrency < 1 {
+		cfg.Concurrency = 1
+	}
+	// Size the per-host pools to the concurrency: DefaultTransport keeps only 2
+	// idle conns per host, forcing fresh TLS handshakes for N concurrent POSTs to
+	// one receiver. Proxy is set explicitly; a hand-built Transport has none.
+	transport := &http.Transport{
+		Proxy:               http.ProxyFromEnvironment,
+		MaxIdleConns:        cfg.Concurrency * 2,
+		MaxIdleConnsPerHost: cfg.Concurrency,
+		MaxConnsPerHost:     cfg.Concurrency,
+		IdleConnTimeout:     90 * time.Second,
+	}
 	return &Dispatcher{
 		db:     db,
 		logger: logger,
-		client: &http.Client{Timeout: cfg.Timeout},
+		client: &http.Client{Timeout: cfg.Timeout, Transport: transport},
 		cfg:    cfg,
+		m:      m,
 	}
 }
 
@@ -73,6 +94,13 @@ func (d *Dispatcher) Start(ctx context.Context) {
 
 func (d *Dispatcher) tick(ctx context.Context) {
 	rows := make([]deliveryRow, 0, d.cfg.BatchSize)
+	// NOTE: FOR UPDATE ... SKIP LOCKED runs here OUTSIDE an explicit transaction,
+	// so the row locks are released as soon as this SELECT returns. That is safe
+	// for a SINGLE dispatcher process fanning the batch out to internal
+	// goroutines (each row appears once in `rows`, delivered by one goroutine).
+	// It does NOT prevent two SEPARATE dispatcher processes from claiming the
+	// same row. If this is ever scaled to >1 dispatcher, wrap the claim in a tx
+	// for the lifetime of delivery, or add a claimed_at/locked_by column.
 	err := d.db.SelectContext(ctx, &rows,
 		`SELECT wd.id, wd.event, wd.asset_id, wd.job_id, wd.payload, wd.attempts, wr.url, wr.secret
 		 FROM webhook_deliveries wd
@@ -85,16 +113,32 @@ func (d *Dispatcher) tick(ctx context.Context) {
 		d.logger.Error("webhook dispatcher: fetch failed", zap.Error(err))
 		return
 	}
+	if len(rows) == 0 {
+		return
+	}
 
+	// Deliver the batch concurrently, bounded by cfg.Concurrency. Each row is
+	// independent: deliver() and its handleFailure/backoff/markFailed updates
+	// are keyed by the row's own id, so concurrent delivery is race-free.
+	g, gctx := errgroup.WithContext(ctx)
+	g.SetLimit(d.cfg.Concurrency)
 	for _, row := range rows {
-		d.deliver(ctx, row)
+		row := row // capture per-iteration (safe on older Go too)
+		g.Go(func() error {
+			d.deliver(gctx, row)
+			return nil
+		})
 	}
+	// deliver never returns an error (failures are persisted, not propagated),
+	// so Wait only blocks until the batch drains.
+	_ = g.Wait()
 }
 
 func (d *Dispatcher) deliver(ctx context.Context, row deliveryRow) {
 	secret, err := utils.DecryptToken(row.Secret, d.cfg.EncryptionKey)
 	if err != nil {
 		d.logger.Error("webhook: decrypt secret failed", zap.String("delivery_id", row.ID.String()), zap.Error(err))
+		d.recordDelivery(ctx, row.Event, "error", 0, false)
 		d.markFailed(ctx, row.ID)
 		return
 	}
@@ -107,15 +151,19 @@ func (d *Dispatcher) deliver(ctx context.Context, row deliveryRow) {
 	))
 	if err != nil {
 		d.logger.Error("webhook: build request failed", zap.Error(err))
+		d.recordDelivery(ctx, row.Event, "error", 0, false)
 		d.handleFailure(ctx, row)
 		return
 	}
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("X-Webhook-Signature", "sha256="+sig)
 
+	start := time.Now()
 	resp, err := d.client.Do(req)
+	elapsed := time.Since(start)
 	if err != nil {
 		d.logger.Warn("webhook: request failed", zap.String("url", row.URL), zap.Error(err))
+		d.recordDelivery(ctx, row.Event, "error", elapsed, false)
 		d.handleFailure(ctx, row)
 		return
 	}
@@ -124,13 +172,36 @@ func (d *Dispatcher) deliver(ctx context.Context, row deliveryRow) {
 	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
 		_, _ = d.db.ExecContext(ctx,
 			`UPDATE webhook_deliveries SET status = 'delivered', delivered_at = now() WHERE id = $1`, row.ID)
+		d.recordDelivery(ctx, row.Event, "delivered", elapsed, true)
 		d.logger.Debug("webhook delivered", zap.String("id", row.ID.String()), zap.String("url", row.URL))
 	} else {
 		d.logger.Warn("webhook: non-2xx response", zap.String("url", row.URL), zap.Int("status", resp.StatusCode))
+		d.recordDelivery(ctx, row.Event, "failed", elapsed, false)
 		d.handleFailure(ctx, row)
 	}
 }
 
+// recordDelivery emits the metrics for one delivery attempt. Only two
+// low-cardinality labels are attached — the event name and a status bucket
+// (delivered/failed/error); asset_id and url are left out on purpose so the
+// label set stays bounded. A zero duration (no HTTP call made) is not recorded.
+func (d *Dispatcher) recordDelivery(ctx context.Context, event, status string, dur time.Duration, success bool) {
+	if d.m == nil {
+		return
+	}
+	labels := otelmetric.WithAttributes(
+		attribute.String("event", event),
+		attribute.String("status", status),
+	)
+	d.m.WebhookDeliveryTotal.Add(ctx, 1, labels)
+	if dur > 0 {
+		d.m.WebhookDeliveryDuration.Record(ctx, dur.Seconds(), labels)
+	}
+	if !success {
+		d.m.WebhookDeliveryFailures.Add(ctx, 1, labels)
+	}
+}
+
 func (d *Dispatcher) handleFailure(ctx context.Context, row deliveryRow) {
 	newAttempts := row.Attempts + 1
 	if newAttempts >= d.cfg.MaxAttempts {
diff --git a/internal/webhook/dispatcher_integration_test.go b/internal/webhook/dispatcher_integration_test.go
index 81e0e8f..b29e5a4 100644
--- a/internal/webhook/dispatcher_integration_test.go
+++ b/internal/webhook/dispatcher_integration_test.go
@@ -18,11 +18,13 @@ import (
 	"github.com/google/uuid"
 	"github.com/jmoiron/sqlx"
 	_ "github.com/lib/pq"
+	"github.com/rndmcodeguy20/mpiper/internal/metrics"
 	"github.com/rndmcodeguy20/mpiper/internal/webhook"
 	"github.com/rndmcodeguy20/mpiper/pkg/utils"
 	"github.com/testcontainers/testcontainers-go"
 	tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
 	"github.com/testcontainers/testcontainers-go/wait"
+	"go.opentelemetry.io/otel/sdk/metric/metricdata"
 	"go.uber.org/zap"
 )
 
@@ -111,6 +113,7 @@ func TestDispatcher_DeliversSuccessfully(t *testing.T) {
 	// Run dispatcher.
 	dispCtx, cancel := context.WithCancel(ctx)
 	defer cancel()
+	m, reader := metrics.NewTestMetrics()
 	d := webhook.NewDispatcher(db, zap.NewNop(), webhook.DispatcherConfig{
 		PollInterval:  50 * time.Millisecond,
 		BatchSize:     10,
@@ -118,7 +121,7 @@ func TestDispatcher_DeliversSuccessfully(t *testing.T) {
 		MaxAttempts:   5,
 		EncryptionKey: testEncryptionKey,
 		Retention:     168 * time.Hour,
-	})
+	}, m)
 	go d.Start(dispCtx)
 
 	// Wait for delivery.
@@ -147,6 +150,30 @@ func TestDispatcher_DeliversSuccessfully(t *testing.T) {
 	if status != "delivered" {
 		t.Errorf("expected delivered, got %s", status)
 	}
+
+	// Verify the delivery metric was recorded with status=delivered.
+	var rm metricdata.ResourceMetrics
+	if err := reader.Collect(ctx, &rm); err != nil {
+		t.Fatalf("collect metrics: %v", err)
+	}
+	var deliveredTotal int64
+	for _, sm := range rm.ScopeMetrics {
+		for _, mt := range sm.Metrics {
+			if mt.Name != "webhook.delivery.total" {
+				continue
+			}
+			if sum, ok := mt.Data.(metricdata.Sum[int64]); ok {
+				for _, dp := range sum.DataPoints {
+					if v, ok := dp.Attributes.Value("status"); ok && v.AsString() == "delivered" {
+						deliveredTotal += dp.Value
+					}
+				}
+			}
+		}
+	}
+	if deliveredTotal < 1 {
+		t.Errorf("expected >=1 delivered webhook.delivery.total metric, got %d", deliveredTotal)
+	}
 }
 
 func TestDispatcher_RetriesOnFailure(t *testing.T) {
@@ -182,7 +209,7 @@ func TestDispatcher_RetriesOnFailure(t *testing.T) {
 		MaxAttempts:   5,
 		EncryptionKey: testEncryptionKey,
 		Retention:     168 * time.Hour,
-	})
+	}, nil)
 	go d.Start(dispCtx)
 	time.Sleep(300 * time.Millisecond)
 	cancel()
@@ -236,7 +263,7 @@ func TestDispatcher_FailsAfterMaxAttempts(t *testing.T) {
 		MaxAttempts:   5,
 		EncryptionKey: testEncryptionKey,
 		Retention:     168 * time.Hour,
-	})
+	}, nil)
 	go d.Start(dispCtx)
 	time.Sleep(300 * time.Millisecond)
 	cancel()
@@ -247,3 +274,110 @@ func TestDispatcher_FailsAfterMaxAttempts(t *testing.T) {
 		t.Errorf("expected failed, got %s", status)
 	}
 }
+
+// TestDispatcher_DeliversConcurrently verifies that a batch larger than the
+// concurrency limit is delivered in parallel (max in-flight > 1, bounded by the
+// limit), every delivery completes, and the delivery metric counts them all.
+func TestDispatcher_DeliversConcurrently(t *testing.T) {
+	ctx := context.Background()
+	db := setupDB(t, ctx)
+
+	encSecret, _ := utils.GenerateToken("secret", testEncryptionKey)
+
+	const total = 20
+	const concurrency = 5
+
+	var inFlight atomic.Int32
+	var maxInFlight atomic.Int32
+	var delivered atomic.Int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		cur := inFlight.Add(1)
+		for {
+			old := maxInFlight.Load()
+			if cur <= old || maxInFlight.CompareAndSwap(old, cur) {
+				break
+			}
+		}
+		time.Sleep(60 * time.Millisecond) // hold the connection so overlap is observable
+		inFlight.Add(-1)
+		delivered.Add(1)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+
+	regID := uuid.New()
+	_, _ = db.ExecContext(ctx,
+		`INSERT INTO webhook_registrations (id, user_id, url, secret, events) VALUES ($1,$2,$3,$4,$5)`,
+		regID, "user-1", srv.URL, encSecret, `["job.done"]`)
+	for i := 0; i < total; i++ {
+		assetID := uuid.New()
+		payload, _ := json.Marshal(map[string]interface{}{"event": "job.done"})
+		_, _ = db.ExecContext(ctx,
+			`INSERT INTO webhook_deliveries (registration_id, event, asset_id, job_id, payload) VALUES ($1,$2,$3,$4,$5)`,
+			regID, "job.done", assetID, int64(i), payload)
+	}
+
+	dispCtx, cancel := context.WithCancel(ctx)
+	defer cancel()
+	m, reader := metrics.NewTestMetrics()
+	d := webhook.NewDispatcher(db, zap.NewNop(), webhook.DispatcherConfig{
+		PollInterval:  50 * time.Millisecond,
+		BatchSize:     total,
+		Timeout:       5 * time.Second,
+		MaxAttempts:   5,
+		EncryptionKey: testEncryptionKey,
+		Retention:     168 * time.Hour,
+		Concurrency:   concurrency,
+	}, m)
+	go d.Start(dispCtx)
+
+	// Wait until every row is persisted as delivered rather than until the
+	// receiver has seen every request: the handler bumps `delivered` before the
+	// dispatcher has read the response and run its UPDATE, so cancelling on the
+	// counter alone can abort the last in-flight deliveries and leave rows
+	// pending (a flaky failure of the assertions below).
+	pending := total
+	deadline := time.Now().Add(10 * time.Second)
+	for pending != 0 {
+		if time.Now().After(deadline) {
+			t.Fatalf("timeout: delivered %d/%d, %d rows still not delivered", delivered.Load(), total, pending)
+		}
+		time.Sleep(20 * time.Millisecond)
+		if err := db.GetContext(ctx, &pending, `SELECT count(*) FROM webhook_deliveries WHERE status != 'delivered'`); err != nil {
+			t.Fatalf("count non-delivered rows: %v", err)
+		}
+	}
+	cancel()
+
+	// Parallelism actually happened, and stayed within the bound.
+	if maxInFlight.Load() < 2 {
+		t.Errorf("expected concurrent delivery (max in-flight > 1), got %d", maxInFlight.Load())
+	}
+	if maxInFlight.Load() > concurrency {
+		t.Errorf("max in-flight %d exceeded concurrency limit %d", maxInFlight.Load(), concurrency)
+	}
+
+	// Metric total counts every delivery.
+	var rm metricdata.ResourceMetrics
+	if err := reader.Collect(ctx, &rm); err != nil {
+		t.Fatalf("collect metrics: %v", err)
+	}
+	var deliveredTotal int64
+	for _, sm := range rm.ScopeMetrics {
+		for _, mt := range sm.Metrics {
+			if mt.Name != "webhook.delivery.total" {
+				continue
+			}
+			if sum, ok := mt.Data.(metricdata.Sum[int64]); ok {
+				for _, dp := range sum.DataPoints {
+					if v, ok := dp.Attributes.Value("status"); ok && v.AsString() == "delivered" {
+						deliveredTotal += dp.Value
+					}
+				}
+			}
+		}
+	}
+	if deliveredTotal != total {
+		t.Errorf("expected %d delivered metric, got %d", total, deliveredTotal)
+	}
+}
diff --git a/internal/webhook/dispatcher_test.go b/internal/webhook/dispatcher_test.go
index 760ea52..cd91f95 100644
--- a/internal/webhook/dispatcher_test.go
+++ b/internal/webhook/dispatcher_test.go
@@ -1,13 +1,104 @@
 package webhook
 
 import (
+	"context"
 	"crypto/hmac"
 	"crypto/sha256"
 	"encoding/hex"
 	"testing"
 	"time"
+
+	"github.com/rndmcodeguy20/mpiper/internal/metrics"
+	"go.opentelemetry.io/otel/sdk/metric/metricdata"
 )
 
+// sumCounter returns the summed value of the int64 Sum metric called name,
+// counting only data points whose "status" attribute equals status. When
+// status is empty, every data point is counted regardless of attributes.
+func sumCounter(t *testing.T, rm *metricdata.ResourceMetrics, name, status string) int64 {
+	t.Helper()
+	var total int64
+	for _, sm := range rm.ScopeMetrics {
+		for _, mt := range sm.Metrics {
+			if mt.Name != name {
+				continue
+			}
+			sum, ok := mt.Data.(metricdata.Sum[int64])
+			if !ok {
+				t.Fatalf("metric %s is not an int64 Sum", name)
+			}
+			for _, dp := range sum.DataPoints {
+				if status == "" {
+					total += dp.Value
+					continue
+				}
+				if v, ok := dp.Attributes.Value("status"); ok && v.AsString() == status {
+					total += dp.Value
+				}
+			}
+		}
+	}
+	return total
+}
+
+func histogramCount(t *testing.T, rm *metricdata.ResourceMetrics, name string) uint64 {
+	t.Helper()
+	var count uint64
+	for _, sm := range rm.ScopeMetrics {
+		for _, mt := range sm.Metrics {
+			if mt.Name != name {
+				continue
+			}
+			h, ok := mt.Data.(metricdata.Histogram[float64])
+			if !ok {
+				t.Fatalf("metric %s is not a float64 Histogram", name)
+			}
+			for _, dp := range h.DataPoints {
+				count += dp.Count
+			}
+		}
+	}
+	return count
+}
+
+func TestRecordDelivery_EmitsMetrics(t *testing.T) {
+	m, reader := metrics.NewTestMetrics()
+	d := &Dispatcher{m: m}
+	ctx := context.Background()
+
+	// One successful delivery (records total + duration, no failure).
+	d.recordDelivery(ctx, "job.done", "delivered", 120*time.Millisecond, true)
+	// One non-2xx failure (records total + failure + duration).
+	d.recordDelivery(ctx, "job.done", "failed", 50*time.Millisecond, false)
+	// One pre-HTTP error (records total + failure, no duration since dur==0).
+	d.recordDelivery(ctx, "job.failed", "error", 0, false)
+
+	var rm metricdata.ResourceMetrics
+	if err := reader.Collect(ctx, &rm); err != nil {
+		t.Fatalf("collect: %v", err)
+	}
+
+	if got := sumCounter(t, &rm, "webhook.delivery.total", ""); got != 3 {
+		t.Errorf("webhook.delivery.total = %d, want 3", got)
+	}
+	if got := sumCounter(t, &rm, "webhook.delivery.total", "delivered"); got != 1 {
+		t.Errorf("delivered total = %d, want 1", got)
+	}
+	if got := sumCounter(t, &rm, "webhook.delivery.failures", ""); got != 2 {
+		t.Errorf("webhook.delivery.failures = %d, want 2", got)
+	}
+	// Only the two calls with dur>0 record into the duration histogram.
+	if got := histogramCount(t, &rm, "webhook.delivery.duration"); got != 2 {
+		t.Errorf("webhook.delivery.duration count = %d, want 2", got)
+	}
+}
+
+func TestRecordDelivery_NilMetricsIsSafe(t *testing.T) {
+	d := &Dispatcher{m: nil}
+	// Must not panic when metrics are not wired.
+	d.recordDelivery(context.Background(), "job.done", "delivered", time.Second, true)
+}
+
 func TestBackoff_ExponentialWithCap(t *testing.T) {
 	tests := []struct {
 		attempt    int
diff --git a/loadtest/README.md b/loadtest/README.md
new file mode 100644
index 0000000..be04b8e
--- /dev/null
+++ b/loadtest/README.md
@@ -0,0 +1,99 @@
+# MPiper Load Harness (k6) — Track 3, Phase 4
+
+Drives the **real** client flow from the host (presign → PUT to MinIO →
+complete), so the whole pipeline — API, outbox relay, Redis, worker, ffmpeg — is
+exercised end-to-end and observable as one trace per asset.
+
+## Install
+
+```bash
+brew install k6          # macOS
+# or see https://grafana.com/docs/k6/latest/set-up/install-k6/
+```
+
+`run.sh` also needs `docker` (to seed a scoped API key into the containerized
+Postgres) and `python3` on the host (standard library only).
+
+## Prerequisites
+
+Bring the stack up **with the observability overlay** (so Prometheus accepts
+k6's remote-write) and ideally the **loadtest overlay** (CPU-pinned, full
+sampling) so runs are reproducible:
+
+```bash
+docker compose \
+  -f docker-compose.yml \
+  -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml \
+  up -d --build
+```
+
+## Run
+
+```bash
+# CLOSED model — fixed VUs hammer the system (find max throughput / saturation)
+./loadtest/run.sh closed --vus 10 --duration 2m
+./loadtest/run.sh closed --vus 20 --duration 3m --ramp
+
+# OPEN model — fixed arrival rate (find the latency knee; watch queue lag grow)
+./loadtest/run.sh open --rate 5/s --duration 3m
+./loadtest/run.sh open --rate 10/s --duration 3m --max-vus 400
+```
+
+Options: `--fixture PATH`, `--base-url URL`, `--no-prometheus`.
+
+## A/B contrast (concurrent worker + webhooks)
+
+The concurrency knobs live on `docker-compose.loadtest.yml` as env vars
+(defaults reproduce the single-threaded baseline). Flip them on the **same
+binary** — no new overlays, no code changes — to isolate the concurrency
+variable at a fixed core budget:
+
+```bash
+CF="-f docker-compose.yml -f docker-compose.observability.yml -f docker-compose.loadtest.yml -f docker-compose.webhooks.yml"
+
+# BEFORE — serial
+WORKER_CPUS=4 MAX_CONCURRENT_JOBS=1 WEBHOOK_CONCURRENCY=1  docker compose $CF up -d --build
+./loadtest/run.sh closed --vus 20 --duration 2m
+./loadtest/run.sh capture "BEFORE serial (mcj=1, wc=1)"
+
+# AFTER — concurrent (flip knobs, recreate worker+api, no rebuild)
+WORKER_CPUS=4 MAX_CONCURRENT_JOBS=8 WEBHOOK_CONCURRENCY=10 docker compose $CF up -d --force-recreate worker api
+./loadtest/run.sh closed --vus 20 --duration 2m
+./loadtest/run.sh capture "AFTER concurrent (mcj=8, wc=10)"
+```
+
+`./loadtest/run.sh capture "label"` snapshots the headline signals (worker μ,
+queue depth, webhook pending/rate/p95, DLQ depth, DB pool) from Prometheus —
+run it right after each load run. Also grab `docker stats --no-stream
+mpiper-worker` for worker CPU%.
+
+> The default 1-CPU pin masks the worker win (threads can't exceed one core of
+> CPU work), so the A/B uses `WORKER_CPUS=4` on **both** sides and the
+> `closed` model to measure max sustained μ directly.
+
+## What to watch
+
+- **k6 terminal summary** — client-side request rate, error rate, and the custom
+  trends (`mpiper_presign_latency_ms`, `mpiper_upload_latency_ms`,
+  `mpiper_complete_latency_ms`). Thresholds map to the §4.2 SLOs and fail the run
+  on breach (exit non-zero).
+- **Grafana** (http://localhost:3000) — the Track 3 dashboards: API RED, the
+  app-saturation/USE view (queue depth, in-flight, backlogs), the pipeline
+  funnel, and queue health. In the open model, queue depth climbing while the
+  API stays healthy is the worker bottleneck made visible.
+- **Tempo** — click a latency exemplar on a histogram panel to jump straight to
+  the trace for that asset and see which span dominates.
+
+## Dedup fan-out
+
+The worker dedups by content hash, so identical bytes do almost no work after
+the first asset. The harness appends per-iteration unique bytes **after** the
+JPEG end-of-image marker (decoders ignore trailing bytes), yielding a valid but
+unique-hash image so every iteration costs real work. See `lib.js`.
+
+## Caveat
+
+Local results are **relative**: trust the bottleneck *location* and
+before/after deltas, not absolute throughput. Always record the resource limits
+(from `docker-compose.loadtest.yml`) with each experiment.
diff --git a/loadtest/closed_model.js b/loadtest/closed_model.js
new file mode 100644
index 0000000..f66b37b
--- /dev/null
+++ b/loadtest/closed_model.js
@@ -0,0 +1,42 @@
+// loadtest/closed_model.js
+//
+// CLOSED model: a fixed pool of VUs, each looping the upload flow as fast as
+// the system allows. Good for finding max throughput and the saturation point
+// of the (single-threaded) worker.
+//
+//   VUS=10 DURATION=2m k6 run loadtest/closed_model.js
+//   RAMP=1              -> ramping-vus profile instead of constant-vus (see below)
+//
+// Prefer the wrapper: ./loadtest/run.sh closed --vus 10 --duration 2m
+
+import { runUploadFlow, sloThresholds } from "./lib.js";
+
+const VUS = parseInt(__ENV.VUS || "10", 10);
+const DURATION = __ENV.DURATION || "2m";
+
+export const options = {
+  scenarios: {
+    closed: __ENV.RAMP === "1"
+      ? {
+          executor: "ramping-vus",
+          startVUs: 1,
+          stages: [
+            { duration: "30s", target: VUS },
+            { duration: DURATION, target: VUS },
+            { duration: "30s", target: 0 },
+          ],
+          gracefulStop: "30s",
+        }
+      : {
+          executor: "constant-vus",
+          vus: VUS,
+          duration: DURATION,
+          gracefulStop: "30s",
+        },
+  },
+  thresholds: sloThresholds,
+};
+
+export default function () {
+  runUploadFlow();
+}
diff --git a/loadtest/lib.js b/loadtest/lib.js
new file mode 100644
index 0000000..5007d63
--- /dev/null
+++ b/loadtest/lib.js
@@ -0,0 +1,123 @@
+// loadtest/lib.js
+//
+// Shared helpers for the MPiper k6 load harness. Each iteration performs the
+// REAL client flow from the host, exactly like scripts/demo-e2e.sh:
+//
+//   1. POST /api/v1/storage/presign           -> uploadUrl + assetId
+//   2. PUT <uploadUrl> (bytes straight to MinIO at the public endpoint)
+//   3. GET /api/v1/assets/{assetId}/complete  -> enqueues processing
+//
+// Dedup defeat: the worker dedups by content hash, so identical bytes do ~no
+// work after the first asset. We append per-iteration unique bytes AFTER the
+// JPEG end-of-image marker (decoders ignore trailing bytes), giving a valid but
+// unique-hash image so we measure real per-job cost. See track-03 §7.
+
+import http from "k6/http";
+import { check } from "k6";
+import { Trend, Rate, Counter } from "k6/metrics";
+
+// --- Config (host-run; see run.sh) ---------------------------------------
+export const BASE_URL = __ENV.BASE_URL || "http://localhost:5010";
+const TOKEN = __ENV.K6_TOKEN || ""; // minted by run.sh (AES-GCM auth token)
+
+// --- Custom metrics mapped to the SLOs (§4.2) ----------------------------
+export const presignLatency = new Trend("mpiper_presign_latency_ms", true);
+export const uploadLatency = new Trend("mpiper_upload_latency_ms", true);
+export const completeLatency = new Trend("mpiper_complete_latency_ms", true);
+export const flowErrors = new Rate("mpiper_flow_errors");
+export const assetsSubmitted = new Counter("mpiper_assets_submitted");
+
+// --- Fixture (loaded once at init) ---------------------------------------
+// open() resolves relative to this script. 'b' returns an ArrayBuffer.
+const FIXTURE_PATH =
+  __ENV.FIXTURE_PATH || "../worker/tests/test_assets/image.jpg";
+const baseFixture = new Uint8Array(open(FIXTURE_PATH, "b"));
+
+function authHeaders(extra) {
+  return Object.assign({ Authorization: `Bearer ${TOKEN}` }, extra || {});
+}
+
+// Build a unique-hash image: base JPEG + a unique trailer (VU/iter/random).
+function uniqueImageBytes() {
+  const tag = `\nMPIPER-LOADTEST-${__VU}-${__ITER}-${Math.random()}`;
+  // k6's JS runtime has no TextEncoder; the tag is ASCII so charCodeAt suffices.
+  const suffix = new Uint8Array(tag.length);
+  for (let i = 0; i < tag.length; i++) suffix[i] = tag.charCodeAt(i) & 0xff;
+  const out = new Uint8Array(baseFixture.length + suffix.length);
+  out.set(baseFixture, 0);
+  out.set(suffix, baseFixture.length);
+  return out.buffer;
+}
+
+// Run one full presign -> upload -> complete flow. Returns true on success.
+export function runUploadFlow() {
+  const bytes = uniqueImageBytes();
+  const contentType = "image/jpeg";
+
+  // 1. presign
+  const presignRes = http.post(
+    `${BASE_URL}/api/v1/storage/presign`,
+    JSON.stringify({
+      fileName: `loadtest-${__VU}-${__ITER}.jpg`,
+      contentType,
+      size: bytes.byteLength,
+    }),
+    { headers: authHeaders({ "Content-Type": "application/json" }), tags: { step: "presign" } }
+  );
+  presignLatency.add(presignRes.timings.duration);
+  const presignOk = check(presignRes, {
+    "presign 2xx": (r) => r.status >= 200 && r.status < 300,
+  });
+  if (!presignOk) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  const data = presignRes.json("data");
+  if (!data || !data.uploadUrl || !data.assetId) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  // 2. upload bytes straight to object storage (public endpoint)
+  const uploadRes = http.put(data.uploadUrl, bytes, {
+    headers: { "Content-Type": contentType },
+    tags: { step: "upload" },
+  });
+  uploadLatency.add(uploadRes.timings.duration);
+  const uploadOk = check(uploadRes, {
+    "upload 2xx": (r) => r.status >= 200 && r.status < 300,
+  });
+  if (!uploadOk) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  // 3. complete -> enqueue processing
+  const completeRes = http.get(
+    `${BASE_URL}/api/v1/assets/${data.assetId}/complete`,
+    { headers: authHeaders(), tags: { step: "complete" } }
+  );
+  completeLatency.add(completeRes.timings.duration);
+  const completeOk = check(completeRes, {
+    "complete 2xx": (r) => r.status >= 200 && r.status < 300,
+  });
+  if (!completeOk) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  flowErrors.add(0);
+  assetsSubmitted.add(1);
+  return true;
+}
+
+// Thresholds shared by both models, derived from the §4.2 SLOs.
+export const sloThresholds = {
+  // Presign SLO: p95 < 150ms.
+  mpiper_presign_latency_ms: ["p(95)<150"],
+  // End-to-end client errors must stay under 1% (job success SLO > 99%).
+  mpiper_flow_errors: ["rate<0.01"],
+  // Overall check pass rate.
+  checks: ["rate>0.99"],
+};
diff --git a/loadtest/open_model.js b/loadtest/open_model.js
new file mode 100644
index 0000000..ead2bfd
--- /dev/null
+++ b/loadtest/open_model.js
@@ -0,0 +1,35 @@
+// loadtest/open_model.js
+//
+// OPEN model: a fixed arrival rate of new uploads/sec, independent of how fast
+// the system responds. Good for finding the latency knee and watching queue
+// lag grow when arrival rate > service rate — a live demonstration of Little's
+// Law (L = λW). When the worker can't keep up, the Redis stream depth climbs
+// even though the API keeps accepting work.
+//
+//   RATE=5 DURATION=3m k6 run loadtest/open_model.js
+//
+// Prefer the wrapper: ./loadtest/run.sh open --rate 5/s --duration 3m
+
+import { runUploadFlow, sloThresholds } from "./lib.js";
+
+const RATE = parseInt(__ENV.RATE || "5", 10); // iterations/sec
+const DURATION = __ENV.DURATION || "3m";
+const MAX_VUS = parseInt(__ENV.MAX_VUS || String(RATE * 20), 10); // VU ceiling, default 20x rate
+
+export const options = {
+  scenarios: {
+    open: {
+      executor: "constant-arrival-rate",
+      rate: RATE,
+      timeUnit: "1s",
+      duration: DURATION,
+      preAllocatedVUs: Math.min(MAX_VUS, Math.max(10, RATE * 2)), // clamp: k6 rejects preAllocatedVUs > maxVUs
+      maxVUs: MAX_VUS,
+    },
+  },
+  thresholds: sloThresholds,
+};
+
+export default function () {
+  runUploadFlow();
+}
diff --git a/loadtest/run.sh b/loadtest/run.sh
new file mode 100755
index 0000000..6880e92
--- /dev/null
+++ b/loadtest/run.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+# loadtest/run.sh — host-run wrapper for the MPiper k6 load harness.
+#
+# Usage:
+#   ./loadtest/run.sh closed --vus 10 --duration 2m [--ramp]
+#   ./loadtest/run.sh open   --rate 5/s --duration 3m [--max-vus 200]
+#   ./loadtest/run.sh capture "label"        # snapshot headline signals from Prometheus
+#
+# Options (closed/open):
+#   --fixture PATH   image fixture to fan out (default worker/tests/test_assets/image.jpg)
+#   --base-url URL   API base (default http://localhost:5010)
+#   --no-prometheus  do not stream k6 metrics to Prometheus remote-write
+#
+# A/B contrast (concurrent worker + webhooks) — same binary, flip env knobs on
+# docker-compose.loadtest.yml (see its header), then run + capture each side:
+#   WORKER_CPUS=4 MAX_CONCURRENT_JOBS=1 WEBHOOK_CONCURRENCY=1  docker compose … up -d --force-recreate worker api
+#   ./loadtest/run.sh closed --vus 20 --duration 2m && ./loadtest/run.sh capture "BEFORE"
+#   WORKER_CPUS=4 MAX_CONCURRENT_JOBS=8 WEBHOOK_CONCURRENCY=10 docker compose … up -d --force-recreate worker api
+#   ./loadtest/run.sh closed --vus 20 --duration 2m && ./loadtest/run.sh capture "AFTER"
+#
+# Requires on the host: k6 (brew install k6), docker (to seed an API key into
+# the containerized Postgres), python3 (stdlib only), and the stack up with the
+# observability overlay (so Prometheus remote-write is enabled).
+set -euo pipefail
+
+MODEL="${1:-}"
+case "$MODEL" in
+  closed|open|capture) ;;   # the only supported sub-commands
+  *) echo "usage: $0 <closed|open|capture> [options|label]" >&2; exit 2 ;;
+esac
+shift || true
+
+# --- capture mode: snapshot headline pipeline signals from Prometheus --------
+# Run RIGHT AFTER a load run (the instant queries use 2m rate windows). Remaining
+# args form a free-text label; _q prints "n/a" for an empty result, "err:…" on failure.
+if [[ "$MODEL" == "capture" ]]; then
+  LABEL="${*:-snapshot}"
+  PROM="${PROM_URL:-http://localhost:9090}"
+  _q() {
+    python3 - "$PROM" "$1" <<'PY'
+import json, sys, urllib.parse, urllib.request
+prom, expr = sys.argv[1], sys.argv[2]
+url = f"{prom}/api/v1/query?" + urllib.parse.urlencode({"query": expr})
+try:
+    with urllib.request.urlopen(url, timeout=10) as r:
+        data = json.load(r)
+    res = data["data"]["result"]
+    print("n/a" if not res else f'{float(res[0]["value"][1]):.3f}')
+except Exception as e:
+    print(f"err:{e}")
+PY
+  }
+  echo "========================================================================"
+  echo " MPiper signals — $LABEL"
+  echo " $(date -u +%Y-%m-%dT%H:%M:%SZ)  ·  prom=$PROM"
+  echo "========================================================================"
+  printf "%-42s %s\n" "Worker service rate mu (jobs/s)" "$(_q 'sum(rate(mpiper_mpiper_job_processing_success_total[2m]))')"
+  printf "%-42s %s\n" "Job failures/s"                  "$(_q 'sum(rate(mpiper_mpiper_job_processing_failed_total[2m]))')"
+  printf "%-42s %s\n" "Queue depth (max)"               "$(_q 'max(mpiper_queue_depth)')"
+  printf "%-42s %s\n" "Asset proc mean (s)"             "$(_q 'sum(rate(mpiper_mpiper_asset_processing_duration_seconds_sum[2m])) / (sum(rate(mpiper_mpiper_asset_processing_duration_seconds_count[2m])) > 0)')"  # "> 0" (not clamp_min 1): rate is per-second, often < 1
+  printf "%-42s %s\n" "Webhook pending (max)"           "$(_q 'max(mpiper_webhook_pending)')"
+  printf "%-42s %s\n" "Webhook delivery rate (/s)"      "$(_q 'sum(rate(mpiper_webhook_delivery_total[2m]))')"
+  printf "%-42s %s\n" "Webhook delivery failures/s"     "$(_q 'sum(rate(mpiper_webhook_delivery_failures_total[2m]))')"
+  printf "%-42s %s\n" "Webhook delivery p95 (s)"        "$(_q 'histogram_quantile(0.95, sum by (le) (rate(mpiper_webhook_delivery_duration_seconds_bucket[2m])))')"
+  printf "%-42s %s\n" "DLQ depth (max)"                 "$(_q 'max(mpiper_mpiper_dlq_depth)')"
+  printf "%-42s %s\n" "DB connections in-use (max)"     "$(_q 'max(mpiper_db_connections_active)')"
+  printf "%-42s %s\n" "DB connection waits (max)"       "$(_q 'max(mpiper_db_connections_wait_count)')"
+  echo "------------------------------------------------------------------------"
+  echo "tip: also grab 'docker stats --no-stream mpiper-worker' for worker CPU%."
+  exit 0
+fi
+
+VUS=10
+DURATION=""
+RATE=5
+MAX_VUS=""
+RAMP=0
+FIXTURE="worker/tests/test_assets/image.jpg"
+BASE_URL="http://localhost:5010"
+USE_PROM=1
+# Value-taking options use ${2?…} so a missing value fails with a named message, not "$2: unbound variable".
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --vus) VUS="${2?--vus requires a value}"; shift 2 ;;
+    --duration) DURATION="${2?--duration requires a value}"; shift 2 ;;
+    --rate) RATE="${2?--rate requires a value}"; RATE="${RATE%/s}"; shift 2 ;;  # accept "5/s" or "5"
+    --max-vus) MAX_VUS="${2?--max-vus requires a value}"; shift 2 ;;
+    --ramp) RAMP=1; shift ;;
+    --fixture) FIXTURE="${2?--fixture requires a value}"; shift 2 ;;
+    --base-url) BASE_URL="${2?--base-url requires a value}"; shift 2 ;;
+    --no-prometheus) USE_PROM=0; shift ;;
+    *) echo "unknown option: $1" >&2; exit 2 ;;
+  esac
+done
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# --- Mint an API key (matches scripts/demo-e2e.sh / README) --------------
+# Seeds a scoped key directly into the containerized Postgres; no AES token.
+# shellcheck source=/dev/null
+. "$REPO_ROOT/scripts/_apikey.sh"
+LOADTEST_TENANT="${LOADTEST_TENANT:-loadtest}"
+K6_TOKEN="$(mint_api_key "$LOADTEST_TENANT")"
+[ -n "$K6_TOKEN" ] || { echo "failed to mint API key" >&2; exit 1; }
+export K6_TOKEN BASE_URL
+case "$FIXTURE" in /*) export FIXTURE_PATH="$FIXTURE" ;; *) export FIXTURE_PATH="$REPO_ROOT/$FIXTURE" ;; esac  # absolute --fixture passes through; relative is repo-root based
+
+# --- Stream client metrics to the bundled Prometheus (remote-write) ------
+K6_OUT=()
+if [[ "$USE_PROM" == "1" ]]; then
+  export K6_PROMETHEUS_RW_SERVER_URL="${K6_PROMETHEUS_RW_SERVER_URL:-http://localhost:9090/api/v1/write}"
+  export K6_PROMETHEUS_RW_TREND_STATS="p(95),p(99),avg,max"
+  K6_OUT=(-o experimental-prometheus-rw)
+  echo "k6 → Prometheus remote-write at $K6_PROMETHEUS_RW_SERVER_URL"
+fi
+
+cd "$REPO_ROOT"
+# ${K6_OUT[@]+…} expands to nothing when the array is empty; a bare "${K6_OUT[@]}" trips `set -u` on bash < 4.4 (e.g. macOS bash 3.2).
+if [[ "$MODEL" == "closed" ]]; then
+  export VUS DURATION="${DURATION:-2m}" RAMP
+  echo "closed model: VUS=$VUS DURATION=$DURATION RAMP=$RAMP"
+  exec k6 run ${K6_OUT[@]+"${K6_OUT[@]}"} loadtest/closed_model.js
+else
+  export RATE DURATION="${DURATION:-3m}"
+  [[ -n "$MAX_VUS" ]] && export MAX_VUS
+  echo "open model: RATE=${RATE}/s DURATION=$DURATION"
+  exec k6 run ${K6_OUT[@]+"${K6_OUT[@]}"} loadtest/open_model.js
+fi
diff --git a/observability/grafana/dashboards/dashboards.yaml b/observability/grafana/dashboards/dashboards.yaml
new file mode 100644
index 0000000..63f23a9
--- /dev/null
+++ b/observability/grafana/dashboards/dashboards.yaml
@@ -0,0 +1,24 @@
+# ============================================================================
+# Grafana Dashboards — provider config
+#
+# IMPORTANT: this file must live in the *dashboards* provisioning directory
+# (/etc/grafana/provisioning/dashboards), NOT the datasources one — Grafana
+# only scans this directory for dashboard providers. It was previously misplaced
+# under datasources/, so no JSON dashboards were ever loaded.
+# ============================================================================
+
+apiVersion: 1
+
+providers:
+  - name: 'MPiper'
+    orgId: 1
+    folder: 'MPiper'
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      # Same dir the JSON dashboards are mounted into; Grafana loads *.json as
+      # dashboards and ignores this provider yaml.
+      path: /etc/grafana/provisioning/dashboards
+      foldersFromFilesStructure: false  # keep off: Grafana needs `folder` unset for this option, and we want the 'MPiper' folder
diff --git a/observability/grafana/dashboards/mpiper-api-red.json b/observability/grafana/dashboards/mpiper-api-red.json
new file mode 100644
index 0000000..7d90e26
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-api-red.json
@@ -0,0 +1,85 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-api-red",
+  "title": "MPiper — API RED",
+  "tags": ["mpiper", "track-3", "red"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "timeseries",
+      "title": "Request rate (req/s) by route",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (http_route) (rate(mpiper_http_server_request_count_total[5m]))",
+          "legendFormat": "{{http_route}}"
+        }
+      ]
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Error ratio (5xx) by route",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1, "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sli:http_error_ratio:ratio_rate5m",
+          "legendFormat": "{{http_route}}"
+        }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "Duration p95 by route (exemplars → Tempo)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sli:http_request_latency_seconds:p95",
+          "legendFormat": "{{http_route}} p95",
+          "exemplar": true
+        }
+      ]
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Presign p95 (SLO < 150ms)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 0.15 }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "targets": [
+        { "refId": "A", "expr": "sli:presign_latency_seconds:p95", "legendFormat": "presign p95" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-experiment-overview.json b/observability/grafana/dashboards/mpiper-experiment-overview.json
new file mode 100644
index 0000000..49d75ac
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-experiment-overview.json
@@ -0,0 +1,212 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "uid": "mpiper-experiment-overview",
+  "title": "MPiper — Experiment Overview (Track 3)",
+  "description": "One pane for load experiments: k6 client load alongside server-side API/worker/queue metrics and the SLO summary.",
+  "tags": ["mpiper", "track-3", "experiment", "load"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-1h", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    { "id": 100, "type": "row", "title": "Load — k6 client (host)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } },
+    {
+      "id": 1, "type": "stat", "title": "Active VUs", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "thresholds" } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(k6_vus)", "legendFormat": "VUs" } ]
+    },
+    {
+      "id": 2, "type": "timeseries", "title": "Throughput — iterations/s & assets submitted/s", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 12, "x": 6, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(k6_iterations_total[1m]))", "legendFormat": "iterations/s" },
+        { "refId": "B", "expr": "sum(rate(k6_mpiper_assets_submitted_total[1m]))", "legendFormat": "assets submitted/s" },
+        { "refId": "C", "expr": "sum(rate(k6_http_reqs_total[1m]))", "legendFormat": "http reqs/s" }
+      ]
+    },
+    {
+      "id": 3, "type": "stat", "title": "Client error rate (SLO < 1%)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 18, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.01 } ] } }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(k6_mpiper_flow_errors_rate)", "legendFormat": "flow errors" },
+        { "refId": "B", "expr": "max(k6_http_req_failed_rate)", "legendFormat": "http failed" }
+      ]
+    },
+    {
+      "id": 4, "type": "timeseries", "title": "Client step latency p95 (ms)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(k6_mpiper_presign_latency_ms_p95)", "legendFormat": "presign p95" },
+        { "refId": "B", "expr": "max(k6_mpiper_upload_latency_ms_p95)", "legendFormat": "upload p95" },
+        { "refId": "C", "expr": "max(k6_mpiper_complete_latency_ms_p95)", "legendFormat": "complete p95" }
+      ]
+    },
+    {
+      "id": 5, "type": "timeseries", "title": "Client HTTP duration p95 / p99 (ms)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(k6_http_req_duration_p95)", "legendFormat": "http_req_duration p95" },
+        { "refId": "B", "expr": "max(k6_http_req_duration_p99)", "legendFormat": "http_req_duration p99" }
+      ]
+    },
+
+    { "id": 101, "type": "row", "title": "API — server RED", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 } },
+    {
+      "id": 6, "type": "timeseries", "title": "Request rate by route (req/s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 8, "x": 0, "y": 15 },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sum by (http_route) (rate(mpiper_http_server_request_count_total[1m]))", "legendFormat": "{{http_route}}" } ]
+    },
+    {
+      "id": 7, "type": "timeseries", "title": "Duration p95 by route (exemplars → Tempo)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 8, "x": 8, "y": 15 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:http_request_latency_seconds:p95", "legendFormat": "{{http_route}}", "exemplar": true } ]
+    },
+    {
+      "id": 8, "type": "timeseries", "title": "In-flight requests & DB query rate", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 8, "x": 16, "y": 15 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(mpiper_http_server_active_requests)", "legendFormat": "in-flight" },
+        { "refId": "B", "expr": "sum(rate(mpiper_db_query_total[1m]))", "legendFormat": "db queries/s" }
+      ]
+    },
+
+    { "id": 102, "type": "row", "title": "Worker — saturation & service rate (μ)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } },
+    {
+      "id": 9, "type": "timeseries", "title": "Service rate μ: consumed vs completed (jobs/s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[2m]))", "legendFormat": "consumed/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_job_processing_success_total[2m]))", "legendFormat": "completed/s (μ)" }
+      ]
+    },
+    {
+      "id": 10, "type": "timeseries", "title": "Asset processing p95 by type (s) — exemplars", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "histogram_quantile(0.95, sum by (le, asset_type) (rate(mpiper_mpiper_asset_processing_duration_seconds_bucket[5m])))", "legendFormat": "{{asset_type}} p95", "exemplar": true } ]
+    },
+    {
+      "id": 11, "type": "timeseries", "title": "Asset processing mean (s) by type", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sum by (asset_type)(rate(mpiper_mpiper_asset_processing_duration_seconds_sum[5m])) / sum by (asset_type)(rate(mpiper_mpiper_asset_processing_duration_seconds_count[5m]))", "legendFormat": "{{asset_type}} mean" } ]
+    },
+    {
+      "id": 12, "type": "timeseries", "title": "Job processing p95 (s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_mpiper_job_processing_duration_seconds_bucket[5m])))", "legendFormat": "job p95" } ]
+    },
+
+    { "id": 103, "type": "row", "title": "Pipeline funnel & backlogs", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 } },
+    {
+      "id": 13, "type": "timeseries", "title": "Funnel: uploaded → ready / failed (per s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_asset_upload_total[2m]))", "legendFormat": "uploaded/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_asset_processing_success_total[2m]))", "legendFormat": "ready/s" },
+        { "refId": "C", "expr": "sum(rate(mpiper_mpiper_asset_processing_failed_total[2m]))", "legendFormat": "failed/s" }
+      ]
+    },
+    {
+      "id": 14, "type": "timeseries", "title": "Queue depth (Redis stream length)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 },
+      "fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 20 } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(mpiper_queue_depth)", "legendFormat": "queue depth" } ]
+    },
+    {
+      "id": 15, "type": "timeseries", "title": "Backlogs: outbox & webhook pending", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(mpiper_outbox_pending)", "legendFormat": "outbox pending" },
+        { "refId": "B", "expr": "max(mpiper_webhook_pending)", "legendFormat": "webhook pending" }
+      ]
+    },
+
+    { "id": 104, "type": "row", "title": "Queue / transport", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } },
+    {
+      "id": 16, "type": "timeseries", "title": "Publish vs consume (msg/s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 46 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_queue_message_published_total[2m]))", "legendFormat": "published/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[2m]))", "legendFormat": "consumed/s" }
+      ]
+    },
+    {
+      "id": 17, "type": "timeseries", "title": "Queue wait p95 & outbox relay lag p95 (s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 46 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:queue_wait_seconds:p95", "legendFormat": "queue wait p95 (publish-side)" },
+        { "refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_outbox_relay_lag_seconds_bucket[5m])))", "legendFormat": "outbox relay lag p95" }
+      ]
+    },
+
+    { "id": 105, "type": "row", "title": "SLO summary (vs targets)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 } },
+    {
+      "id": 18, "type": "stat", "title": "Presign p95 (< 150ms)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.15 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:presign_latency_seconds:p95" } ]
+    },
+    {
+      "id": 19, "type": "stat", "title": "Image ready p95 (< 5s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 5 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:image_ready_latency_seconds:p95" } ]
+    },
+    {
+      "id": 20, "type": "stat", "title": "Queue wait p95 (< 2s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 12, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 2 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:queue_wait_seconds:p95" } ]
+    },
+    {
+      "id": 21, "type": "stat", "title": "Job success rate (> 99%)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 18, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:job_success_ratio:ratio_rate5m" } ]
+    },
+
+    { "id": 106, "type": "row", "title": "Database (connection pool & queries)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } },
+    {
+      "id": 22, "type": "timeseries", "title": "Connection pool (in-use / idle / open vs max)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 61 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "mpiper_db_connections_active", "legendFormat": "in-use" },
+        { "refId": "B", "expr": "mpiper_db_connections_idle", "legendFormat": "idle" },
+        { "refId": "C", "expr": "mpiper_db_connections_open", "legendFormat": "open" },
+        { "refId": "D", "expr": "mpiper_db_connections_max_open", "legendFormat": "max open" }
+      ]
+    },
+    {
+      "id": 23, "type": "timeseries", "title": "DB query rate (q/s), p95 latency & pool waits", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 61 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [
+        { "matcher": { "id": "byName", "options": "query p95" }, "properties": [ { "id": "unit", "value": "s" } ] }
+      ] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_db_query_total[2m]))", "legendFormat": "queries/s" },
+        { "refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_db_query_duration_seconds_bucket[5m])))", "legendFormat": "query p95" },
+        { "refId": "C", "expr": "rate(mpiper_db_connections_wait_count[5m])", "legendFormat": "pool waits/s" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-metrics.json b/observability/grafana/dashboards/mpiper-metrics.json
index e24f49f..edb9e6c 100644
--- a/observability/grafana/dashboards/mpiper-metrics.json
+++ b/observability/grafana/dashboards/mpiper-metrics.json
@@ -38,7 +38,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -87,7 +87,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_http_server_request_count_total[5m])",
           "refId": "A"
@@ -99,7 +99,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -152,9 +152,9 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "refId": "A"
         }
       ],
@@ -164,7 +164,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -238,7 +238,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "mpiper_http_server_active_requests",
           "refId": "A"
@@ -250,7 +250,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -299,9 +299,9 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"5..\"}[5m]) / rate(mpiper_http_server_request_count_total[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"5..\"}[5m]) / rate(mpiper_http_server_request_count_total[5m])",
           "refId": "A"
         }
       ],
@@ -324,7 +324,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -398,27 +398,27 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.50, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.50, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "legendFormat": "p50",
           "refId": "A"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.95, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.95, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "legendFormat": "p95",
           "refId": "B"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "legendFormat": "p99",
           "refId": "C"
         }
@@ -429,7 +429,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -503,27 +503,27 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"2..\"}[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"2..\"}[5m])",
           "legendFormat": "2xx - {{method}} {{path}}",
           "refId": "A"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"4..\"}[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"4..\"}[5m])",
           "legendFormat": "4xx - {{method}} {{path}}",
           "refId": "B"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"5..\"}[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"5..\"}[5m])",
           "legendFormat": "5xx - {{method}} {{path}}",
           "refId": "C"
         }
@@ -547,7 +547,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -621,7 +621,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_asset_upload_total[5m])",
           "legendFormat": "Uploads",
@@ -630,7 +630,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_asset_processing_success_total[5m])",
           "legendFormat": "Success",
@@ -639,7 +639,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_asset_processing_failed_total[5m])",
           "legendFormat": "Failed",
@@ -652,7 +652,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -726,7 +726,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_asset_upload_duration_bucket[5m]))",
           "legendFormat": "Upload p95",
@@ -735,7 +735,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_asset_processing_duration_bucket[5m]))",
           "legendFormat": "Processing p95",
@@ -748,7 +748,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -791,7 +791,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.50, rate(mpiper_asset_size_bucket[5m]))",
           "legendFormat": "p50",
@@ -800,7 +800,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_asset_size_bucket[5m]))",
           "legendFormat": "p95",
@@ -809,7 +809,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.99, rate(mpiper_asset_size_bucket[5m]))",
           "legendFormat": "p99",
@@ -835,7 +835,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -909,7 +909,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.50, rate(mpiper_db_query_duration_bucket[5m]))",
           "legendFormat": "p50",
@@ -918,7 +918,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_db_query_duration_bucket[5m]))",
           "legendFormat": "p95",
@@ -927,7 +927,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.99, rate(mpiper_db_query_duration_bucket[5m]))",
           "legendFormat": "p99",
@@ -940,7 +940,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -1014,7 +1014,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "mpiper_db_connections_active",
           "legendFormat": "Active",
@@ -1023,7 +1023,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "mpiper_db_connections_idle",
           "legendFormat": "Idle",
@@ -1036,7 +1036,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -1129,7 +1129,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_db_query_success_total[5m])",
           "legendFormat": "Success",
@@ -1138,7 +1138,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_db_query_failed_total[5m])",
           "legendFormat": "Errors",
diff --git a/observability/grafana/dashboards/mpiper-pipeline-funnel.json b/observability/grafana/dashboards/mpiper-pipeline-funnel.json
new file mode 100644
index 0000000..c2b254a
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-pipeline-funnel.json
@@ -0,0 +1,90 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-pipeline-funnel",
+  "title": "MPiper — Pipeline Funnel & Stage Latency",
+  "tags": ["mpiper", "track-3", "funnel"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "timeseries",
+      "title": "Funnel rate: uploaded → processed (success/failed)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_asset_upload_total[5m]))", "legendFormat": "uploaded/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_asset_processing_success_total[5m]))", "legendFormat": "ready/s" },
+        { "refId": "C", "expr": "sum(rate(mpiper_mpiper_asset_processing_failed_total[5m]))", "legendFormat": "failed/s" }
+      ]
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Image ready p95 (SLO < 5s)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 5 } ] }
+        },
+        "overrides": []
+      },
+      "targets": [ { "refId": "A", "expr": "sli:image_ready_latency_seconds:p95", "legendFormat": "image p95" } ]
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Video ready p95 (SLO < 60s)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 60 } ] }
+        },
+        "overrides": []
+      },
+      "targets": [ { "refId": "A", "expr": "sli:video_ready_latency_seconds:p95", "legendFormat": "video p95" } ]
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Job success rate (SLO > 99%)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit", "min": 0, "max": 1,
+          "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] }
+        },
+        "overrides": []
+      },
+      "targets": [ { "refId": "A", "expr": "sli:job_success_ratio:ratio_rate5m", "legendFormat": "success rate" } ]
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Worker asset processing p95 by type (exemplars → Tempo)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le, asset_type) (rate(mpiper_mpiper_asset_processing_duration_seconds_bucket[5m])))",
+          "legendFormat": "{{asset_type}} p95",
+          "exemplar": true
+        }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-queue-health.json b/observability/grafana/dashboards/mpiper-queue-health.json
new file mode 100644
index 0000000..bf46c14
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-queue-health.json
@@ -0,0 +1,68 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-queue-health",
+  "title": "MPiper — Queue Health",
+  "tags": ["mpiper", "track-3", "queue"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "timeseries",
+      "title": "Stream length (queue depth)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(mpiper_queue_depth)", "legendFormat": "queue depth" } ]
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Publish vs consume rate (msg/s)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_queue_message_published_total[5m]))", "legendFormat": "published/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[5m]))", "legendFormat": "consumed/s" }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "Queue wait p95 (publish-side proxy; authoritative in Tempo)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:queue_wait_seconds:p95", "legendFormat": "queue wait p95" } ]
+    },
+    {
+      "id": 4,
+      "type": "timeseries",
+      "title": "Webhook delivery p95 (SLO < 10s) + pending",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [ { "matcher": { "id": "byFrameRefID", "options": "B" }, "properties": [ { "id": "unit", "value": "short" } ] } ] },
+      "targets": [
+        { "refId": "A", "expr": "sli:webhook_delivery_latency_seconds:p95", "legendFormat": "delivery p95" },
+        { "refId": "B", "expr": "sli:webhook_pending:current", "legendFormat": "pending (count)" }
+      ]
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Dead-letter stream depth (media:jobs:dlq)",
+      "description": "Number of poison/over-retried messages parked in the DLQ. Should sit at ~0; a rising line means jobs are failing permanently and need inspection/replay.",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(mpiper_mpiper_dlq_depth)", "legendFormat": "DLQ depth" } ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-worker-use.json b/observability/grafana/dashboards/mpiper-worker-use.json
new file mode 100644
index 0000000..0ab7fd2
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-worker-use.json
@@ -0,0 +1,89 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-worker-use",
+  "title": "MPiper — Worker / App Saturation (USE)",
+  "tags": ["mpiper", "track-3", "use"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "row",
+      "title": "Saturation (app-level: no per-container CPU by design)",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Queue depth (Redis stream length)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:queue_depth:current", "legendFormat": "queue depth" }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "In-flight HTTP requests",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(mpiper_http_server_active_requests)", "legendFormat": "active requests" }
+      ]
+    },
+    {
+      "id": 4,
+      "type": "timeseries",
+      "title": "Outbox relay lag p95 (oldest pending age)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_outbox_relay_lag_seconds_bucket[5m])))",
+          "legendFormat": "relay lag p95"
+        }
+      ]
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Backlogs: outbox + webhook pending",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:outbox_pending:current", "legendFormat": "outbox pending" },
+        { "refId": "B", "expr": "sli:webhook_pending:current", "legendFormat": "webhook pending" }
+      ]
+    },
+    {
+      "id": 6,
+      "type": "row",
+      "title": "Errors",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }
+    },
+    {
+      "id": 7,
+      "type": "timeseries",
+      "title": "Queue message failures + outbox publish failures (rate)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_queue_message_failed_total[5m]))", "legendFormat": "queue failures/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_outbox_publish_failures_total[5m]))", "legendFormat": "outbox publish failures/s" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/datasources/dashboards.yml b/observability/grafana/datasources/dashboards.yml
deleted file mode 100644
index 3e5981b..0000000
--- a/observability/grafana/datasources/dashboards.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-# ============================================================================
-# Grafana Dashboards - Auto-provisioning
-# ============================================================================
-
-apiVersion: 1
-
-providers:
-  - name: 'Default'
-    orgId: 1
-    folder: 'MPiper'
-    type: file
-    disableDeletion: false
-    updateIntervalSeconds: 10
-    allowUiUpdates: true
-    options:
-      path: /etc/grafana/provisioning/dashboards
-      foldersFromFilesStructure: true
diff --git a/observability/grafana/datasources/datasources.yml b/observability/grafana/datasources/datasources.yml
index 1d35862..a8514be 100644
--- a/observability/grafana/datasources/datasources.yml
+++ b/observability/grafana/datasources/datasources.yml
@@ -57,8 +57,11 @@ datasources:
     url: http://loki:3100
     jsonData:
       derivedFields:
+        # Match trace_id across formats: worker (trace_id=abc), API JSON
+        # ("trace_id":"abc"), and API dev pretty (trace_id: abc). \W+ swallows
+        # the separator (=, ":", ": ") between the key and the hex value.
         - datasourceUid: tempo
-          matcherRegex: "trace_id=(\\w+)"
+          matcherRegex: "trace_id\\W+(\\w+)"
           name: TraceID
           url: '$${__value.raw}'
     editable: true
diff --git a/observability/prometheus.rules.yml b/observability/prometheus.rules.yml
new file mode 100644
index 0000000..005f54a
--- /dev/null
+++ b/observability/prometheus.rules.yml
@@ -0,0 +1,126 @@
+# ============================================================================
+# MPiper — SLO recording rules (Track 3, Phase 3)
+#
+# Encodes the SLIs from track-03-observability-and-load.md §4.2 as recording
+# rules so each renders directly on a dashboard against its target.
+#
+# Metric naming convention
+# -------------------------
+# App telemetry is OTLP → the collector's Prometheus exporter, which is
+# configured with `namespace: mpiper` (see observability/otel-collector.yml).
+# OTel instrument names have dots replaced by underscores and the unit appended:
+#
+#   Go API   http.server.request.duration (s)  -> mpiper_http_server_request_duration_seconds_{bucket,sum,count}
+#   Go API   asset.processing.success (counter) -> mpiper_asset_processing_success_total
+#   Worker   mpiper.job.processing.success      -> mpiper_mpiper_job_processing_success_total
+#            (worker instruments already carry a `mpiper.` prefix, so the
+#             collector namespace yields a double `mpiper_mpiper_` prefix)
+#
+# If your collector build sets `add_metric_suffixes: false` or a different
+# namespace, adjust the series names below. Validate with:
+#   promtool check rules observability/prometheus.rules.yml
+# and against live series in Prometheus → Status → Targets after a load run.
+# ============================================================================
+
+groups:
+  - name: mpiper_slo
+    interval: 15s
+    rules:
+      # --- Presign latency: p95 of POST /api/v1/storage/presign  (target < 150ms)
+      - record: sli:presign_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_http_server_request_duration_seconds_bucket{http_route="/api/v1/storage/presign"}[5m])
+            )
+          )
+
+      # --- API overall request latency p95 per route (RED "Duration")
+      - record: sli:http_request_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le, http_route) (
+              rate(mpiper_http_server_request_duration_seconds_bucket[5m])
+            )
+          )
+
+      # --- API error ratio per route (RED "Errors"): 5xx / all. Routes with no
+      # traffic in the window are dropped (`> 0`) rather than dividing by zero.
+      - record: sli:http_error_ratio:ratio_rate5m
+        expr: >
+          sum by (http_route) (
+            rate(mpiper_http_server_request_count_total{http_status_code=~"5.."}[5m])
+          )
+          /
+          (
+            sum by (http_route) (rate(mpiper_http_server_request_count_total[5m])) > 0
+          )
+
+      # --- Image ready latency: p95 of worker asset processing for images  (target < 5s)
+      # Server-side processing proxy for complete -> ready; queue wait is added
+      # on top and is best read from the trace waterfall (enqueue -> consume).
+      - record: sli:image_ready_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_mpiper_asset_processing_duration_seconds_bucket{asset_type="image"}[5m])
+            )
+          )
+
+      # --- Video ready latency: p95 of worker asset processing for videos  (target < 60s)
+      - record: sli:video_ready_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_mpiper_asset_processing_duration_seconds_bucket{asset_type="video"}[5m])
+            )
+          )
+
+      # --- Queue wait: p95 of the producer-side processing lag histogram.
+      # The authoritative enqueue -> consume gap is visible per-asset in Tempo;
+      # this metric approximates the publish-side contribution.
+      - record: sli:queue_wait_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_queue_processing_lag_seconds_bucket[5m])
+            )
+          )
+
+      # --- Job success rate: done / (done + failed)  (target > 99%)
+      # `or vector(0)` guards the empty-vector pitfall (no failures => no failed
+      # series). `> 0` yields no sample when idle instead of a misleading 0%.
+      - record: sli:job_success_ratio:ratio_rate5m
+        expr: >
+          (sum(rate(mpiper_mpiper_job_processing_success_total[5m])) or vector(0))
+          /
+          (
+            (sum(rate(mpiper_mpiper_job_processing_success_total[5m])) or vector(0))
+            + (sum(rate(mpiper_mpiper_job_processing_failed_total[5m])) or vector(0))
+            > 0
+          )
+
+      # --- Webhook delivery latency: p95 of delivery HTTP calls  (target < 10s)
+      - record: sli:webhook_delivery_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_webhook_delivery_duration_seconds_bucket[5m])
+            )
+          )
+
+      # --- Queue health snapshots (gauges already exported by the app)
+      - record: sli:queue_depth:current
+        expr: max(mpiper_queue_depth)
+
+      - record: sli:outbox_pending:current
+        expr: max(mpiper_outbox_pending)
+
+      - record: sli:webhook_pending:current
+        expr: max(mpiper_webhook_pending)
diff --git a/observability/prometheus.yml b/observability/prometheus.yml
index a628c78..bf75a95 100644
--- a/observability/prometheus.yml
+++ b/observability/prometheus.yml
@@ -9,6 +9,11 @@ global:
     cluster: 'mpiper-local'
     environment: 'development'
 
+# SLO recording rules (Track 3, Phase 3). Mounted into the container at
+# /etc/prometheus/ alongside this file by docker-compose.observability.yml.
+rule_files:
+  - /etc/prometheus/prometheus.rules.yml
+
 # Alertmanager configuration (optional)
 # alerting:
 #   alertmanagers:
diff --git a/pkg/errors/api.go b/pkg/errors/api.go
index e092851..58acca4 100644
--- a/pkg/errors/api.go
+++ b/pkg/errors/api.go
@@ -18,6 +18,9 @@ type BadRequestError struct{ *ApiError }
 type InternalServerErrorError struct{ *ApiError }
 type UnauthorizedError struct{ *ApiError }
 type ConflictError struct{ *ApiError }
+type UnprocessableEntityError struct{ *ApiError }
+type ForbiddenError struct{ *ApiError }
+type TooManyRequestsError struct{ *ApiError }
 
 func NewNotFoundError(message string, cause error) *NotFoundError {
 	return &NotFoundError{&ApiError{
@@ -64,6 +67,33 @@ func NewConflictError(message string, cause error) *ConflictError {
 	}}
 }
 
+func NewUnprocessableEntityError(message string, cause error) *UnprocessableEntityError {
+	return &UnprocessableEntityError{&ApiError{
+		Message:    message,
+		Code:       "UNPROCESSABLE_ENTITY_ERROR",
+		Details:    unwrapErrorDetails(cause),
+		StatusCode: http.StatusUnprocessableEntity,
+	}}
+}
+
+func NewForbiddenError(message string, cause error) *ForbiddenError {
+	return &ForbiddenError{&ApiError{
+		Message:    message,
+		Code:       "FORBIDDEN_ERROR",
+		Details:    unwrapErrorDetails(cause),
+		StatusCode: http.StatusForbidden,
+	}}
+}
+
+func NewTooManyRequestsError(message string, cause error) *TooManyRequestsError {
+	return &TooManyRequestsError{&ApiError{
+		Message:    message,
+		Code:       "TOO_MANY_REQUESTS_ERROR",
+		Details:    unwrapErrorDetails(cause),
+		StatusCode: http.StatusTooManyRequests,
+	}}
+}
+
 func unwrapErrorDetails(err error) interface{} {
 	if err == nil {
 		return nil
diff --git a/pkg/utils/apikey.go b/pkg/utils/apikey.go
new file mode 100644
index 0000000..6e479f6
--- /dev/null
+++ b/pkg/utils/apikey.go
@@ -0,0 +1,100 @@
+package utils
+
+import (
+	"crypto/rand"
+	"crypto/sha256"
+	"crypto/subtle"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"strings"
+	"time"
+)
+
+// API key wire format: mp_<prefix>_<secret>
+//
+//   - "mp"     a fixed scheme tag so keys are recognizable in logs/UIs.
+//   - prefix   a short, PUBLIC identifier (hex). Stored in cleartext and used to
+//              narrow the hash lookup; safe to display in key listings.
+//   - secret   high-entropy random (hex). Never stored; only its hash is kept.
+//
+// Both prefix and secret are hex-encoded so the '_' delimiter never collides
+// with the encoding alphabet (unlike base64url, which uses '_').
+const (
+	apiKeyScheme    = "mp"
+	apiKeyPrefixLen = 4  // bytes -> 8 hex chars
+	apiKeySecretLen = 24 // bytes -> 48 hex chars (192 bits of entropy)
+)
+
+var (
+	// ErrMalformedAPIKey is returned when a presented key does not match the
+	// mp_<prefix>_<secret> shape.
+	ErrMalformedAPIKey = errors.New("malformed api key")
+)
+
+// APIKeyMaterial is the result of minting a new API key. Full is shown to the
+// caller exactly once; only Hash (and the public Prefix) are persisted.
+type APIKeyMaterial struct {
+	Full   string // mp_<prefix>_<secret> — show once, never store
+	Prefix string // public lookup hint
+	Hash   string // SHA-256 hex of Full — store this
+}
+
+// GenerateAPIKey mints a new random API key.
+func GenerateAPIKey() (APIKeyMaterial, error) {
+	prefixBytes := make([]byte, apiKeyPrefixLen)
+	if _, err := rand.Read(prefixBytes); err != nil {
+		return APIKeyMaterial{}, fmt.Errorf("generate api key prefix: %w", err)
+	}
+	secretBytes := make([]byte, apiKeySecretLen)
+	if _, err := rand.Read(secretBytes); err != nil {
+		return APIKeyMaterial{}, fmt.Errorf("generate api key secret: %w", err)
+	}
+
+	prefix := hex.EncodeToString(prefixBytes)
+	secret := hex.EncodeToString(secretBytes)
+	full := fmt.Sprintf("%s_%s_%s", apiKeyScheme, prefix, secret)
+
+	return APIKeyMaterial{
+		Full:   full,
+		Prefix: prefix,
+		Hash:   HashAPIKey(full),
+	}, nil
+}
+
+// HashAPIKey returns the lowercase hex SHA-256 of a full API key. API keys are
+// high-entropy, so a fast hash with an indexed equality lookup is appropriate
+// (and unlike bcrypt, allows the DB to index key_hash).
+func HashAPIKey(full string) string {
+	sum := sha256.Sum256([]byte(full))
+	return hex.EncodeToString(sum[:])
+}
+
+// ParseAPIKey validates the wire format and returns the public prefix. The
+// secret is intentionally not returned — callers authenticate by hashing the
+// full key, not by inspecting the secret.
+func ParseAPIKey(full string) (prefix string, err error) {
+	parts := strings.SplitN(full, "_", 3)
+	if len(parts) != 3 || parts[0] != apiKeyScheme || parts[1] == "" || parts[2] == "" {
+		return "", ErrMalformedAPIKey
+	}
+	return parts[1], nil
+}
+
+// ConstantTimeHashEqual compares two key hashes in time independent of their
+// contents (a length mismatch returns early, which is fine for fixed-size
+// hashes). Lookups are by indexed equality; this guards in-process comparisons.
+func ConstantTimeHashEqual(a, b string) bool {
+	return subtle.ConstantTimeCompare([]byte(a), []byte(b)) == 1
+}
+
+// IsExpired reports whether expiresAt is set and is at or before now; the exact
+// expiry instant counts as expired. A nil expiresAt means the key never expires.
+func IsExpired(expiresAt *time.Time, now time.Time) bool {
+	return expiresAt != nil && !now.Before(*expiresAt)
+}
+
+// IsRevoked reports whether revokedAt is set (i.e. the key has been revoked).
+func IsRevoked(revokedAt *time.Time) bool {
+	return revokedAt != nil
+}
diff --git a/pkg/utils/apikey_test.go b/pkg/utils/apikey_test.go
new file mode 100644
index 0000000..a1e3c1a
--- /dev/null
+++ b/pkg/utils/apikey_test.go
@@ -0,0 +1,120 @@
+package utils
+
+import (
+	"strings"
+	"testing"
+	"time"
+)
+
+func TestGenerateAPIKey_RoundTrip(t *testing.T) {
+	mat, err := GenerateAPIKey()
+	if err != nil {
+		t.Fatalf("GenerateAPIKey: %v", err)
+	}
+
+	if !strings.HasPrefix(mat.Full, "mp_") {
+		t.Errorf("full key %q does not start with mp_", mat.Full)
+	}
+
+	// Re-hashing the printed full key must match the stored hash (the property
+	// the auth middleware relies on at login time).
+	if got := HashAPIKey(mat.Full); got != mat.Hash {
+		t.Errorf("re-hash mismatch: got %q want %q", got, mat.Hash)
+	}
+
+	// The parsed prefix must equal the stored public prefix.
+	prefix, err := ParseAPIKey(mat.Full)
+	if err != nil {
+		t.Fatalf("ParseAPIKey: %v", err)
+	}
+	if prefix != mat.Prefix {
+		t.Errorf("parsed prefix %q != material prefix %q", prefix, mat.Prefix)
+	}
+
+	if len(mat.Hash) != 64 {
+		t.Errorf("hash length = %d, want 64 (sha256 hex)", len(mat.Hash))
+	}
+}
+
+func TestGenerateAPIKey_Unique(t *testing.T) {
+	seen := make(map[string]struct{})
+	for i := 0; i < 100; i++ {
+		mat, err := GenerateAPIKey()
+		if err != nil {
+			t.Fatalf("GenerateAPIKey: %v", err)
+		}
+		if _, dup := seen[mat.Full]; dup {
+			t.Fatalf("duplicate key generated: %q", mat.Full)
+		}
+		seen[mat.Full] = struct{}{}
+	}
+}
+
+func TestParseAPIKey_Malformed(t *testing.T) {
+	cases := []string{
+		"",
+		"not-a-key",
+		"mp_only",
+		"xx_prefix_secret", // wrong scheme
+		"mp__secret",       // empty prefix
+		"mp_prefix_",       // empty secret
+		"Bearer mp_a_b",
+	}
+	for _, c := range cases {
+		if _, err := ParseAPIKey(c); err == nil {
+			t.Errorf("ParseAPIKey(%q) = nil error, want error", c)
+		}
+	}
+}
+
+func TestHashAPIKey_Deterministic(t *testing.T) {
+	const k = "mp_abcd1234_deadbeef"
+	if HashAPIKey(k) != HashAPIKey(k) {
+		t.Error("HashAPIKey is not deterministic")
+	}
+	if HashAPIKey(k) == HashAPIKey(k+"x") {
+		t.Error("distinct keys hashed to the same value")
+	}
+}
+
+func TestIsExpired(t *testing.T) {
+	now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)
+	past := now.Add(-time.Hour)
+	future := now.Add(time.Hour)
+
+	if IsExpired(nil, now) {
+		t.Error("nil expiresAt should never be expired")
+	}
+	if !IsExpired(&past, now) {
+		t.Error("past expiresAt should be expired")
+	}
+	if IsExpired(&future, now) {
+		t.Error("future expiresAt should not be expired")
+	}
+	// Exactly at expiry is treated as expired (not Before).
+	if !IsExpired(&now, now) {
+		t.Error("expiresAt == now should be expired")
+	}
+}
+
+func TestIsRevoked(t *testing.T) {
+	if IsRevoked(nil) {
+		t.Error("nil revokedAt should not be revoked")
+	}
+	ts := time.Now()
+	if !IsRevoked(&ts) {
+		t.Error("set revokedAt should be revoked")
+	}
+}
+
+func TestConstantTimeHashEqual(t *testing.T) {
+	if !ConstantTimeHashEqual("abc", "abc") {
+		t.Error("equal hashes should compare equal")
+	}
+	if ConstantTimeHashEqual("abc", "abd") {
+		t.Error("different hashes should not compare equal")
+	}
+	if ConstantTimeHashEqual("abc", "abcd") {
+		t.Error("different-length hashes should not compare equal")
+	}
+}
diff --git a/pkg/utils/response.go b/pkg/utils/response.go
index 85c2673..2198e57 100644
--- a/pkg/utils/response.go
+++ b/pkg/utils/response.go
@@ -37,6 +37,12 @@ func WriteErrorResponse(w http.ResponseWriter, err error) {
 		apiErr = e.ApiError
 	case *errors.ConflictError:
 		apiErr = e.ApiError
+	case *errors.UnprocessableEntityError:
+		apiErr = e.ApiError
+	case *errors.ForbiddenError:
+		apiErr = e.ApiError
+	case *errors.TooManyRequestsError:
+		apiErr = e.ApiError
 	default:
 		apiErr = &errors.ApiError{
 			Message:    "Internal server error",
diff --git a/pkg/utils/storagex/s3_test.go b/pkg/utils/storagex/s3_test.go
index 3a1a95e..2cf806a 100644
--- a/pkg/utils/storagex/s3_test.go
+++ b/pkg/utils/storagex/s3_test.go
@@ -100,7 +100,7 @@ func TestS3PresignAndPublicURLEndpoints(t *testing.T) {
 		internal = "http://minio:9000"
 		public   = "http://localhost:9000"
 		bucket   = "mpiper"
-		key      = "media/raw/abc"
+		key      = "media/tenant-abc/raw/abc"
 	)
 
 	t.Run("public endpoint set: presign + PublicURL use public host", func(t *testing.T) {
diff --git a/pkg/utils/tenant/tenant.go b/pkg/utils/tenant/tenant.go
new file mode 100644
index 0000000..3916507
--- /dev/null
+++ b/pkg/utils/tenant/tenant.go
@@ -0,0 +1,17 @@
+// Package tenant provides shared validation for tenant identifiers used as
+// storage key prefixes. Keeping the slug rules in one place ensures the CLI
+// that mints API keys and the service that derives storage paths agree on
+// what counts as a valid tenant.
+package tenant
+
+import "regexp"
+
+// slugRe is the anchored pattern for tenant identifiers: 1 to 64 characters
+// drawn from lowercase ASCII letters, digits, underscore and hyphen. The ^/$
+// anchors force the whole string to match, so a valid substring is not enough.
+var slugRe = regexp.MustCompile(`^[a-z0-9_-]{1,64}$`)
+
+// IsValidSlug reports whether s matches slugRe and may be used as a tenant id.
+func IsValidSlug(s string) bool {
+	return slugRe.MatchString(s)
+}
diff --git a/pkg/utils/tenant/tenant_test.go b/pkg/utils/tenant/tenant_test.go
new file mode 100644
index 0000000..0e9f3d7
--- /dev/null
+++ b/pkg/utils/tenant/tenant_test.go
@@ -0,0 +1,88 @@
+package tenant
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestIsValidSlug(t *testing.T) {
+	// Generate the length-boundary inputs with strings.Repeat so their sizes
+	// are exact rather than hand-counted.
+	atLimit := strings.Repeat("a", 64)
+	overLimit := strings.Repeat("a", 65)
+	farOverLimit := strings.Repeat("a", 128)
+
+	cases := []struct {
+		label string
+		slug  string
+		valid bool
+	}{
+		// Accepted: lowercase letters, digits, underscore, hyphen
+		{"simple lowercase", "acme", true},
+		{"with digits", "team42", true},
+		{"with underscore", "demo_user", true},
+		{"with hyphen", "demo-user", true},
+		{"mixed", "acme_corp-2026", true},
+		{"single char", "a", true},
+		{"single digit", "7", true},
+		{"single underscore", "_", true},
+		{"single hyphen", "-", true},
+
+		// Accepted: longest allowed slug (64 characters)
+		{"64 chars exact", atLimit, true},
+
+		// Rejected: nothing at all
+		{"empty string", "", false},
+
+		// Rejected: one character beyond the 64 limit
+		{"65 chars one over", overLimit, false},
+
+		// Rejected: far beyond the limit
+		{"128 chars", farOverLimit, false},
+
+		// Rejected: uppercase letter
+		{"uppercase rejected", "Acme", false},
+
+		// Rejected: whitespace
+		{"space rejected", "demo user", false},
+
+		// Rejected: slash (a path separator would split the storage key)
+		{"slash rejected", "demo/user", false},
+
+		// Rejected: dot
+		{"dot rejected", "demo.user", false},
+
+		// Rejected: plus
+		{"plus rejected", "demo+user", false},
+
+		// Rejected: at sign
+		{"at rejected", "demo@user", false},
+
+		// Rejected: colon
+		{"colon rejected", "demo:user", false},
+
+		// Rejected: non-ASCII letters
+		{"unicode rejected", "démö", false},
+
+		// Rejected: embedded newline
+		{"newline rejected", "demo\nuser", false},
+
+		// Rejected: embedded NUL byte
+		{"null byte rejected", "demo\x00user", false},
+
+		// A leading underscore or hyphen is accepted by the pattern. These
+		// rows pin that down as intended behavior; callers wanting stricter
+		// rules have to enforce them at the policy layer.
+		{"leading hyphen", "-leading", true},
+		{"leading underscore", "_leading", true},
+	}
+
+	for _, c := range cases {
+		t.Run(c.label, func(t *testing.T) {
+			// One subtest per row so a failure names the offending case.
+			if got := IsValidSlug(c.slug); got != c.valid {
+				t.Errorf("IsValidSlug(%q) = %v, want %v", c.slug, got, c.valid)
+			}
+		})
+	}
+}
diff --git a/pyproject.toml b/pyproject.toml
index 6df13f2..e478938 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,3 +55,6 @@ mpiper_consumer = "worker.consumer:main"
 [tool.poetry.group.dev.dependencies]
 black = "^25.12.0"
 
+[tool.pytest.ini_options]
+pythonpath = ["."]
+
diff --git a/scripts/_apikey.sh b/scripts/_apikey.sh
new file mode 100644
index 0000000..0bc46f2
--- /dev/null
+++ b/scripts/_apikey.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# scripts/_apikey.sh
+#
+# Shared helper for the dev/e2e/loadtest scripts: mint a scoped API key and seed
+# it directly into Postgres via `docker exec psql`. This mirrors what
+# `cmd/mint-api-key` does (mp_<prefix>_<secret>, SHA-256-hashed at rest) but runs
+# against the containerized DB without needing the Go toolchain or an exposed DB
+# port on the host. Only a stdlib python3 is required (hashlib + os.urandom).
+#
+# Source this file, then call:  KEY="$(mint_api_key <tenant>)"
+#
+# Honors PG_CONTAINER / PG_USER / PG_DB (defaults: mpiper-postgres / mpiper / mpiper).
+
+PG_CONTAINER="${PG_CONTAINER:-mpiper-postgres}"
+PG_USER="${PG_USER:-mpiper}"
+PG_DB="${PG_DB:-mpiper}"
+APIKEY_PYTHON_BIN="${APIKEY_PYTHON_BIN:-python3}"
+
+# gen_api_key prints "<full> <hash> <prefix>" for a fresh key. The format and
+# hashing MUST match pkg/utils/apikey.go (mp_<4-byte-hex>_<24-byte-hex>,
+# key_hash = sha256_hex(full)).
+gen_api_key() {
+  "$APIKEY_PYTHON_BIN" - <<'PY'
+from hashlib import sha256
+from os import urandom
+key_prefix, key_secret = urandom(4).hex(), urandom(24).hex()
+plaintext = "mp_" + key_prefix + "_" + key_secret
+print(plaintext, sha256(plaintext.encode()).hexdigest(), key_prefix)
+PY
+}
+
+# mint_api_key <tenant> — inserts a key row and echoes the plaintext key; returns 1 with no output on any failure.
+mint_api_key() {
+  local tenant="${1:-}" full hash prefix slug_re='^[a-z0-9_-]{1,64}$'  # same slug rule as pkg/utils/tenant
+  [[ "$tenant" =~ $slug_re ]] || { echo "mint_api_key: invalid tenant '$tenant'" >&2; return 1; }  # value is spliced into SQL
+  read -r full hash prefix < <(gen_api_key) && [ -n "$prefix" ] || return 1  # interpreter missing or key generation failed
+  docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -tAc \
+    "INSERT INTO api_keys (tenant_id, key_hash, prefix, scopes) VALUES ('${tenant}', '${hash}', '${prefix}', '[\"assets:write\",\"webhooks:write\"]'::jsonb);" \
+    >/dev/null || return 1  # never hand out a key that was not stored
+  echo "$full"
+}
diff --git a/scripts/demo-e2e.sh b/scripts/demo-e2e.sh
index 66ad7e6..db39a49 100755
--- a/scripts/demo-e2e.sh
+++ b/scripts/demo-e2e.sh
@@ -17,9 +17,9 @@
 #
 #   ./scripts/demo-e2e.sh
 #
-# Requirements on the host: bash, curl, jq, docker, and a python3 with the
-# `cryptography` package (used only to mint the auth token, matching
-# pkg/utils/crypt.go). Override defaults via env vars (API, ENCRYPTION_KEY, …).
+# Requirements on the host: bash, curl, jq, docker, and a python3 (stdlib only —
+# used to mint an API key, matching pkg/utils/apikey.go). Override defaults via
+# env vars (API, …).
 
 set -uo pipefail
 
@@ -27,8 +27,7 @@ set -uo pipefail
 # Configuration
 # ---------------------------------------------------------------------------
 API="${API:-http://localhost:5010}"
-ENCRYPTION_KEY="${ENCRYPTION_KEY:-}"
-USER_ID="${USER_ID:-demo-user}"
+TENANT="${TENANT:-${USER_ID:-demo-user}}"
 WEBHOOK_RECEIVER_URL="${WEBHOOK_RECEIVER_URL:-http://webhook-receiver:8080}"  # internal docker name; reached by the in-container dispatcher
 WEBHOOK_SECRET="${WEBHOOK_SECRET:-demo-webhook-secret}"
 PG_CONTAINER="${PG_CONTAINER:-mpiper-postgres}"
@@ -71,22 +70,20 @@ for bin in curl jq docker; do
   command -v "$bin" >/dev/null 2>&1 || die "'$bin' is required but not installed."
 done
 
-# Pick a python3 that can import cryptography (for token minting).
+# Pick a python3 (stdlib only — used to mint an API key).
 PYTHON_BIN=""
 for cand in python3 python; do
-  if command -v "$cand" >/dev/null 2>&1 && "$cand" -c "import cryptography" >/dev/null 2>&1; then
+  if command -v "$cand" >/dev/null 2>&1 && "$cand" -c 'import sys; sys.exit(sys.version_info < (3, 6))' >/dev/null 2>&1; then
     PYTHON_BIN="$cand"; break
   fi
 done
-[ -n "$PYTHON_BIN" ] || die "Need a python3 with the 'cryptography' package on PATH (pip install cryptography)."
+[ -n "$PYTHON_BIN" ] || die "Need a python3 (>= 3.6, stdlib only) on PATH."
 info "Using python: $(command -v "$PYTHON_BIN")"
 
-# Resolve the encryption key. Prefer the env var; otherwise read it from .env.local.
-if [ -z "$ENCRYPTION_KEY" ] && [ -f "$ROOT_DIR/.env.local" ]; then
-  ENCRYPTION_KEY="$(grep -E '^ENCRYPTION_KEY=' "$ROOT_DIR/.env.local" | head -1 | cut -d= -f2-)"
-fi
-[ -n "$ENCRYPTION_KEY" ] || die "ENCRYPTION_KEY not set and not found in .env.local."
-[ "${#ENCRYPTION_KEY}" -eq 32 ] || die "ENCRYPTION_KEY must be exactly 32 bytes (got ${#ENCRYPTION_KEY})."
+# API-key minting helper (seeds a key directly into the containerized Postgres).
+# shellcheck source=/dev/null
+. "$ROOT_DIR/scripts/_apikey.sh"
+APIKEY_PYTHON_BIN="$PYTHON_BIN"
 
 [ -f "$IMAGE_FILE" ] || die "Image fixture not found: $IMAGE_FILE"
 [ -f "$VIDEO_FILE" ] || die "Video fixture not found: $VIDEO_FILE (generate with ffmpeg or set VIDEO_FILE)."
@@ -120,26 +117,14 @@ fi
 # ---------------------------------------------------------------------------
 psql_q() { docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -tAc "$1" 2>/dev/null; }
 
-mint_token() {
-  ENCRYPTION_KEY="$ENCRYPTION_KEY" USER_ID="$USER_ID" "$PYTHON_BIN" - <<'PY'
-import base64, os
-from cryptography.hazmat.primitives.ciphers.aead import AESGCM
-key = os.environ["ENCRYPTION_KEY"].encode()
-uid = os.environ["USER_ID"].encode()
-nonce = os.urandom(12)
-ct = AESGCM(key).encrypt(nonce, uid, None)
-print(base64.urlsafe_b64encode(nonce + ct).rstrip(b"=").decode())
-PY
-}
-
 # ---------------------------------------------------------------------------
-# Auth token + webhook registration
+# Auth (API key) + webhook registration
 # ---------------------------------------------------------------------------
-step "Mint auth token (user=$USER_ID)"
-TOKEN="$(mint_token)" || die "token generation failed"
-[ -n "$TOKEN" ] || die "empty token"
+step "Mint API key (tenant=$TENANT)"
+TOKEN="$(mint_api_key "$TENANT")" || die "api key minting failed"
+[ -n "$TOKEN" ] || die "empty api key"
 AUTH="Authorization: Bearer $TOKEN"
-pass "Token minted (${TOKEN:0:16}…)"
+pass "API key minted (${TOKEN:0:16}…)"
 
 step "Register webhook"
 REG_RESP="$(curl -fsS -X POST "$API/api/v1/webhooks" \
diff --git a/scripts/mint-api-key.sh b/scripts/mint-api-key.sh
new file mode 100755
index 0000000..b92277d
--- /dev/null
+++ b/scripts/mint-api-key.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# scripts/mint-api-key.sh
+#
+# Thin wrapper around `go run ./cmd/mint-api-key` that mints a scoped API key
+# for a tenant and prints the plaintext key (shown ONCE) to stdout. The
+# human-readable summary is printed to stderr, so capture the key with:
+#
+#   KEY="$(./scripts/mint-api-key.sh --tenant demo-user)"
+#
+# All flags are passed straight through to the CLI:
+#   --tenant <id>            (required)
+#   --env <development|...>  (default: $ENV or development)
+#   --expires <duration>     (e.g. 720h; 0/omitted = never)
+#   --scopes <a,b,c>         (optional, comma-separated)
+
+set -o errexit -o nounset -o pipefail
+
+# Run from the repository root so the ./cmd/mint-api-key package path resolves.
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT_DIR"
+exec go run ./cmd/mint-api-key "$@"
diff --git a/scripts/test-webhooks.sh b/scripts/test-webhooks.sh
index 7d3c408..f8cebff 100755
--- a/scripts/test-webhooks.sh
+++ b/scripts/test-webhooks.sh
@@ -10,32 +10,24 @@ set -euo pipefail
 
 API="http://localhost:5010"
 WEBHOOK_RECEIVER="http://webhook-receiver:8080"  # internal docker network URL
-ENCRYPTION_KEY="${ENCRYPTION_KEY:-change_me_to_a_32_byte_secret____}"
-
-echo "=== 1. Generate auth token ==="
-# Create a token for user "demo-user" using the same AES encryption the API uses.
-# For dev-testing, we call the API with a pre-generated token.
-TOKEN=$(python3 -c "
-import sys, os
-sys.path.insert(0, '.')
-# Simple AES-GCM token generation matching pkg/utils/crypt.go
-from cryptography.hazmat.primitives.ciphers.aead import AESGCM
-import base64, os as _os
-key = b'${ENCRYPTION_KEY}'
-nonce = _os.urandom(12)
-aes = AESGCM(key)
-ct = aes.encrypt(nonce, b'demo-user', None)
-token = base64.urlsafe_b64encode(nonce + ct).rstrip(b'=').decode()
-print(token)
-" 2>/dev/null || echo "MANUAL_TOKEN_NEEDED")
-
-if [ "$TOKEN" = "MANUAL_TOKEN_NEEDED" ]; then
-  echo "Could not auto-generate token. Set TOKEN env var manually."
-  echo "  export TOKEN=<your-bearer-token>"
+TENANT="${TENANT:-demo-user}"
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+# shellcheck source=/dev/null
+. "$ROOT_DIR/scripts/_apikey.sh"
+
+echo "=== 1. Mint API key ==="
+# Seed a scoped API key for the tenant directly into the containerized Postgres
+# (matches pkg/utils/apikey.go). Replaces the old inline AES token.
+TOKEN="${TOKEN:-$(mint_api_key "$TENANT" || true)}"  # honor a pre-set TOKEN; empty means minting failed
+
+if [ -z "$TOKEN" ]; then
+  echo "Could not mint an API key. Mint one manually and set TOKEN:"
+  echo "  export TOKEN=\"\$(go run ./cmd/mint-api-key --tenant $TENANT)\""
   exit 1
 fi
 
-echo "Token: ${TOKEN:0:20}..."
+echo "API key: ${TOKEN:0:20}..."
 AUTH="Authorization: Bearer $TOKEN"
 
 echo ""
diff --git a/worker/__init__.py b/worker/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/worker/consumer/config.py b/worker/consumer/config.py
index 2d824b7..c9b82b2 100644
--- a/worker/consumer/config.py
+++ b/worker/consumer/config.py
@@ -129,9 +129,14 @@ class WorkerConfig:
     log_level: str
     auto_migrate: bool
     migrations_dir: str
+    # migration_allow_destructive gates versions 7 and 8 which drop or alter
+    # existing user data. Must be true on first bootstrap of a fresh database.
+    migration_allow_destructive: bool = False
     consumer_group: str = "worker-group"
     max_concurrent_jobs: int = 5
     job_poll_interval: int = 10
+    recovery_min_idle_ms: int = 120000
+    dlq_stream_name: str = "media:jobs:dlq"
 
     @staticmethod
     def from_env() -> "WorkerConfig":
@@ -158,11 +163,14 @@ def from_env() -> "WorkerConfig":
             ),
             max_concurrent_jobs=int(os.getenv("MAX_CONCURRENT_JOBS", "5")),
             job_poll_interval=int(os.getenv("JOB_POLL_INTERVAL", "10")),
+            recovery_min_idle_ms=int(os.getenv("RECOVERY_MIN_IDLE_MS", "120000")),
+            dlq_stream_name=os.getenv("STREAM_DLQ_NAME", "media:jobs:dlq"),
             temp_dir=temp_dir,
             stream_name=os.getenv("STREAM_NAME", "media:jobs"),
             consumer_group=os.getenv("CONSUMER_GROUP", "worker-group"),
             log_level=os.getenv("LOG_LEVEL", "INFO"),
             auto_migrate=os.getenv("AUTO_MIGRATE", "false").lower() == "true",
+            migration_allow_destructive=os.getenv("MIGRATION_ALLOW_DESTRUCTIVE", "false").lower() == "true",
             migrations_dir=os.getenv("MIGRATIONS_DIR", default_migrations_dir),
         )
 
diff --git a/worker/consumer/consumer.py b/worker/consumer/consumer.py
index c66c990..8b83666 100644
--- a/worker/consumer/consumer.py
+++ b/worker/consumer/consumer.py
@@ -16,6 +16,24 @@
 - Stuck or missing stream messages are recovered by re-adding pending jobs
   back to the stream (simple requeue strategy).
 
+Concurrency model — thread pool (and why):
+- Per-job work is dominated by I/O and subprocesses, not pure-Python CPU:
+  object-store download/upload (network I/O, releases the GIL), ffmpeg invoked
+  via `subprocess` (a separate OS process — true parallelism regardless of the
+  GIL), Pillow (releases the GIL for most pixel ops), and psycopg calls (I/O).
+  A `ThreadPoolExecutor` therefore gives real concurrency here while keeping a
+  single shared `psycopg_pool.ConnectionPool` (thread-safe) and a single set of
+  OTel instruments (thread-safe; context is per-thread, so each task starts its
+  own `worker.consume` span cleanly).
+- A process pool was considered and rejected for now: it would force a DB/Redis
+  pool per process, require pickling the storage client across the process
+  boundary, and re-initialise OTel in every worker — significant cost for little
+  gain given how little time is spent in GIL-bound Python.
+- GIL escalation path: if profiling later shows pure-Python sections (e.g. large
+  file hashing, GIL-holding Pillow paths) dominate and threads stop scaling,
+  move the CPU-bound stage to a ProcessPoolExecutor (hybrid: threads for I/O,
+  processes for transform) rather than converting the whole consumer.
+
 Notes about external expectations:
 - `pg_pool` must expose `connect_pg()` returning a DB connection context manager
   compatible with `psycopg` (connection yields `cursor()` and supports commit/rollback).
@@ -27,17 +45,25 @@
 
 from __future__ import annotations
 
+import threading
 import time
-from typing import Dict
+from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent.futures import wait as futures_wait
+from contextlib import nullcontext
+from typing import Dict, Set
 
 import redis
 from redis.exceptions import ResponseError
+from opentelemetry import trace
+from opentelemetry.propagate import extract
 
 from worker.consumer.config import WorkerConfig
 from worker.consumer.db import PgPool
 from worker.processing.processor import RetryableException, process_asset_dispatch
 from worker.storage.base import StorageX
 from worker.utils.logger import get_logger
+from worker.utils.tracing import get_tracer
+from worker.utils import metrics as wm
 from worker.webhooks import insert_webhook_deliveries
 
 logger = get_logger(__name__)
@@ -92,12 +118,27 @@ def __init__(
         self.storage = storage
         self.cfg = cfg
 
+        # Bounded worker pool. Honours MAX_CONCURRENT_JOBS (cfg.max_concurrent_jobs):
+        # up to that many jobs run concurrently, one per pool thread. In-flight
+        # work is tracked so the read loop only fetches as many new messages as
+        # there is free capacity, and so shutdown can drain deterministically.
+        self._max_workers = max(1, int(getattr(cfg, "max_concurrent_jobs", 1) or 1))
+        self._executor = ThreadPoolExecutor(
+            max_workers=self._max_workers, thread_name_prefix="job"
+        )
+        self._inflight_lock = threading.Lock()
+        self._inflight = 0
+        self._futures: Set[Future] = set()
+        self._closed = False
+
         # Periodic recovery state. _last_recovery = 0 makes recovery run on the
         # first consume() so leftovers from a prior crash are swept at startup.
-        # The interval matches the 2-minute staleness threshold in the recovery
-        # query. See DEV-35.
+        # The cadence matches the XAUTOCLAIM min-idle threshold below. See DEV-35.
         self._last_recovery = 0.0
         self._recovery_interval = 120.0
+        # Minimum idle time (ms) before a pending message is eligible to be
+        # reclaimed from a (presumed dead) consumer via XAUTOCLAIM.
+        self._recovery_min_idle_ms = int(getattr(cfg, "recovery_min_idle_ms", 120000))
 
         # Ensure the consumer group exists. If it already exists Redis raises an
         # error; ignore that specific error.
@@ -117,13 +158,22 @@ def __init__(
         except OSError as exc:
             logger.warning("could not write health sentinel: %s", exc)
 
+        # Expose the dead-letter stream depth as an observable gauge so DLQ
+        # accumulation is visible on the dashboards (no-op if telemetry is off).
+        try:
+            wm.register_dlq_depth_gauge(
+                lambda: self.redis.xlen(self.cfg.dlq_stream_name)
+            )
+        except Exception as exc:  # never let telemetry wiring break startup
+            logger.warning("could not register DLQ depth gauge: %s", exc)
+
     def consume(self, consumer_name: str) -> bool:
-        """Poll the stream and process a single message.
+        """Top up the worker pool with new stream messages.
 
-        This blocks briefly while waiting for messages. When a message is returned,
-        it can contain either `job_id` or `asset_id` in its payload. `job_id` is
-        preferred; if `asset_id` is present, the method ensures a job row exists
-        before delegating to the job handler.
+        Reads up to the current free capacity (MAX_CONCURRENT_JOBS minus in-flight)
+        and submits each message to the thread pool, where it is processed
+        concurrently. Each message carries either `job_id` (preferred) or
+        `asset_id`; dispatch + per-message ack happen inside the task.
 
         Parameters
         ----------
@@ -133,21 +183,28 @@ def consume(self, consumer_name: str) -> bool:
         Returns
         -------
         bool
-            True if a message was consumed (even if processing failed), False if
-            no messages were available.
+            True if at least one message was read and submitted, False when there
+            was no free capacity or no messages were available. The caller should
+            sleep briefly on False to avoid a busy loop.
         """
         # Recover stuck jobs on a fixed cadence, independent of load. Doing this
         # only on the idle path meant recovery never ran under sustained load —
         # exactly when crashed-mid-job rows are most likely. See DEV-35.
-        self._maybe_recover()
+        self._maybe_recover(consumer_name)
+
+        # Only fetch as many messages as we can actually start right now. When at
+        # capacity we return immediately (don't hold a 5s blocking read open while
+        # full) so freed slots are picked up promptly on the next call.
+        free = self._free_capacity()
+        if free <= 0:
+            return False
 
-        # Read one message for this consumer (blocking short period)
         try:
             resp = self.redis.xreadgroup(
                 groupname=self.cfg.consumer_group,
                 consumername=consumer_name,
                 streams={self.cfg.stream_name: ">"},
-                count=1,
+                count=free,
                 block=5000,
             )
         except (TimeoutError, redis.exceptions.TimeoutError):
@@ -158,15 +215,53 @@ def consume(self, consumer_name: str) -> bool:
 
         # Response format: [(stream_name, [(msg_id, {field: value}), ...])]
         _, messages = resp[0]
-        msg_id, fields = messages[0]
+        for msg_id, fields in messages:
+            self._submit(msg_id, fields)
+
+        return len(messages) > 0
+
+    def _free_capacity(self) -> int:
+        """Return the pool size minus the in-flight count (free job slots)."""
+        with self._inflight_lock:  # same lock that guards every update of _inflight
+            return self._max_workers - self._inflight
 
+    def _submit(self, msg_id: str, fields: Dict[str, str]) -> None:
+        """Claim one in-flight slot, then hand the message to the thread pool."""
+        with self._inflight_lock:
+            self._inflight += 1
+        try:
+            task = self._executor.submit(self._process_message, msg_id, dict(fields))
+        except RuntimeError:
+            # The pool refuses work once it has been shut down: give the slot
+            # back and leave the message unacked in the PEL for later recovery.
+            with self._inflight_lock:
+                self._inflight -= 1
+        else:
+            self._futures.add(task)
+            task.add_done_callback(self._on_task_done)
+
+    def _on_task_done(self, future: Future) -> None:
+        """Done-callback attached by _submit: untrack `future` and free its slot."""
+        self._futures.discard(future)  # discard() is a no-op if it is not tracked
+        with self._inflight_lock:
+            self._inflight -= 1
+
+    def _process_message(self, msg_id: str, fields: Dict[str, str]) -> None:
+        """Process a single stream message inside a pool thread.
+
+        Each task starts its OWN `worker.consume` span (carrying this message's
+        extracted trace context) so concurrent jobs never share a span and the
+        per-asset Tempo waterfalls stay separate. The message is acked by its own
+        msg_id only on success (inside `_handle_job`); on failure it is left in
+        the PEL for recovery.
+        """
+        wm.record_consume()
         try:
             # Normalize fields to a dict
             payload: Dict[str, str] = {k: fields[k] for k in fields}
             logger.info("message received id=%s payload=%s", msg_id, payload)
 
             body = payload.get("body")
-            # logger.debug("message body: %s", body)
             if body:
                 # If a body field is present, it contains a JSON-encoded dict
                 import json
@@ -175,25 +270,97 @@ def consume(self, consumer_name: str) -> bool:
                 payload.update(body_dict)
                 payload.pop("body")
 
-            # logger.debug("normalized payload: %s", payload)
-
             job_id = payload.get("job_id")
             asset_id = payload.get("asset_id")
 
-            if job_id:
-                self._handle_job(job_id, msg_id)
-            elif asset_id:
-                self._handle_asset_message(asset_id, msg_id)
-            else:
-                logger.error("message missing job_id and asset_id: %s", payload)
-                # Acknowledge to remove the malformed message from the stream.
-                self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id)
+            # Extract the producer trace context (injected by the Go relay) and
+            # continue the trace here, starting the consume span INSIDE this task
+            # thread. See _consume_span.
+            with self._consume_span(payload, msg_id, job_id, asset_id):
+                if job_id:
+                    self._handle_job(job_id, msg_id)
+                elif asset_id:
+                    self._handle_asset_message(asset_id, msg_id)
+                else:
+                    logger.error("message missing job_id and asset_id: %s", payload)
+                    # Acknowledge to remove the malformed message from the stream.
+                    self.redis.xack(
+                        self.cfg.stream_name, self.cfg.consumer_group, msg_id
+                    )
         except Exception:
             logger.exception("unhandled exception while processing message %s", msg_id)
             # Do not ack the message so it remains in the pending entries list
             # for recovery/retry later.
 
-        return True
+    def _await_inflight(self, timeout: float | None = None):
+        """Wait for the currently tracked tasks, for at most `timeout` seconds.
+
+        Returns the (done, not_done) sets from concurrent.futures.wait, or two
+        empty sets when nothing is tracked. shutdown() drains the pool with it.
+        """
+        snapshot = list(self._futures)
+        if snapshot:
+            return futures_wait(snapshot, timeout=timeout)
+        return set(), set()
+
+    def shutdown(self, timeout: float = 30.0) -> None:
+        """Drain in-flight jobs for up to `timeout` seconds, then close the pool.
+
+        Only the first call does anything; later calls return immediately. The
+        wait is meant to fit inside the container stop_grace_period. Jobs still
+        running when it expires are abandoned without an ack, so their Redis
+        messages stay in the PEL for XAUTOCLAIM recovery to reclaim.
+        """
+        if not self._closed:
+            # Flip the flag first so a repeated call short-circuits.
+            self._closed = True
+            stragglers = self._await_inflight(timeout=timeout)[1]
+            if stragglers:
+                logger.warning(
+                    "shutdown: %d job(s) still running after %.0fs; abandoning "
+                    "(messages remain in PEL for recovery)",
+                    len(stragglers),
+                    timeout,
+                )
+            # wait=False returns at once instead of blocking on abandoned jobs.
+            self._executor.shutdown(wait=False)
+
+    def _consume_span(self, payload, msg_id, job_id, asset_id):
+        """Open the `worker.consume` CONSUMER span for one stream message.
+
+        The span continues (and links to) the trace context carried in the
+        payload's traceparent/tracestate/baggage fields. Returns a context
+        manager; when get_tracer() returns None a no-op nullcontext() is
+        returned instead, so message processing carries on untraced.
+        """
+        tracer = get_tracer()
+        if tracer is None:
+            return nullcontext()
+
+        # Only the trace-propagation fields are handed to the extractor.
+        propagation_keys = ("traceparent", "tracestate", "baggage")
+        headers = {key: payload[key] for key in propagation_keys if key in payload}
+        upstream_ctx = extract(headers)
+        upstream_sc = trace.get_current_span(upstream_ctx).get_span_context()
+
+        span_attrs = {
+            "messaging.system": "redis",
+            "messaging.destination.name": self.cfg.stream_name,
+            "messaging.message.id": msg_id,
+        }
+        # Attach whichever identifiers the message actually carried.
+        for label, ident in (("job_id", job_id), ("asset_id", asset_id)):
+            if ident:
+                span_attrs[label] = str(ident)
+
+        return tracer.start_as_current_span(
+            "worker.consume",
+            context=upstream_ctx,
+            kind=trace.SpanKind.CONSUMER,
+            # Link to the producer span only when a valid context was extracted.
+            links=[trace.Link(upstream_sc)] if upstream_sc.is_valid else None,
+            attributes=span_attrs,
+        )
 
     def _handle_job(self, job_id: int, msg_id: str) -> None:
         """Load the job row, mark it in-progress, run processing, and finalize.
@@ -233,10 +400,12 @@ def _handle_job(self, job_id: int, msg_id: str) -> None:
             conn.commit()
 
         # Run the processing outside the DB transaction.
+        job_start = time.time()
         try:
             process_asset_dispatch(asset_id, self.pg, self.storage, self.cfg)
         except Exception as exc:
             logger.exception("processing failed for job=%s asset=%s", job_id, asset_id)
+            wm.record_job(success=False, duration_seconds=time.time() - job_start)
 
             with self.pg.get_pg_conn() as conn:
                 cur = conn.cursor()
@@ -252,7 +421,8 @@ def _handle_job(self, job_id: int, msg_id: str) -> None:
                 # fail it immediately instead of burning the whole retry budget.
                 retryable = isinstance(exc, RetryableException)
 
-                if not retryable or attempts_now >= self.cfg.redis.max_retries:
+                permanent = not retryable or attempts_now >= self.cfg.redis.max_retries
+                if permanent:
                     cur.execute(
                         "UPDATE jobs SET status = 'failed', last_error = %s, updated_at = now() WHERE job_id = %s",
                         (str(exc), str(job_id)),
@@ -268,7 +438,26 @@ def _handle_job(self, job_id: int, msg_id: str) -> None:
                         (str(exc), str(job_id)),
                     )
                 conn.commit()
-            # Leave the Redis message unacked so it remains in the pending list.
+
+            if permanent:
+                # Poison message: route to the dead-letter stream with failure
+                # metadata for inspection/replay, then ACK the original so it
+                # stops being redelivered. (Previously the message was left
+                # unacked here, so a permanently-failed job lingered in the PEL
+                # and got reclaimed forever.)
+                self._dead_letter(
+                    msg_id,
+                    {
+                        "job_id": str(job_id),
+                        "asset_id": str(asset_id),
+                        "error": str(exc),
+                        "attempts": str(attempts_now),
+                        "original_msg_id": str(msg_id),
+                        "failed_at": str(time.time()),
+                    },
+                )
+            # Retryable failures are left unacked so they remain in the PEL and
+            # are picked up again by XAUTOCLAIM recovery.
             return
 
         # On success, mark job done and mark related asset ready.
@@ -285,6 +474,8 @@ def _handle_job(self, job_id: int, msg_id: str) -> None:
             insert_webhook_deliveries(cur, asset_id, job_id, "job.done")
             conn.commit()
 
+        wm.record_job(success=True, duration_seconds=time.time() - job_start)
+
         # Acknowledge the Redis stream message.
         self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id)
 
@@ -343,7 +534,7 @@ def _handle_asset_message(self, asset_id: str, msg_id: str) -> None:
         # Delegate to _handle_job using the job id we now have.
         self._handle_job(job_id, msg_id)
 
-    def _maybe_recover(self) -> None:
+    def _maybe_recover(self, consumer_name: str | None = None) -> None:
         """Run stuck-job recovery if the recovery interval has elapsed.
 
         Time-gated so recovery fires on a fixed cadence regardless of whether
@@ -352,26 +543,106 @@ def _maybe_recover(self) -> None:
         now = time.time()
         if now - self._last_recovery >= self._recovery_interval:
             self._last_recovery = now
-            self._recover_stuck_pending()
+            self._recover_stuck_pending(consumer_name)
 
-    def _recover_stuck_pending(self) -> None:
-        """Requeue stale pending/in_progress jobs back onto the stream.
+    def _dead_letter(self, msg_id: str, fields: Dict[str, str]) -> None:
+        """Move a poison message to the dead-letter stream and ack the original.
 
-        This is a conservative recovery strategy: find jobs that appear stuck
-        (older than a configured threshold) and push a message for each back to
-        the Redis stream so consumer groups can pick them up again.
+        XADD the failure metadata to `dlq_stream_name`, then XACK the original
+        message so it leaves the main stream's PEL. Best-effort: if Redis errors
+        here we log and leave the message unacked (recovery will retry), rather
+        than losing it.
         """
-        with self.pg.get_pg_conn() as conn:
-            cur = conn.cursor()
-            cur.execute(
-                "SELECT job_id, asset_id, status FROM jobs WHERE status IN ('pending','in_progress') AND updated_at < now() - interval '2 minutes'",
+        try:
+            self.redis.xadd(self.cfg.dlq_stream_name, fields)
+            self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id)
+            logger.warning(
+                "message %s dead-lettered to %s", msg_id, self.cfg.dlq_stream_name
             )
-            rows = cur.fetchall()
+        except redis.exceptions.RedisError:
+            logger.exception("failed to dead-letter message %s", msg_id)
 
-            for jid, asset_id, status in rows:
-                logger.info(
-                    "requeueing job %s asset %s status %s", jid, asset_id, status
+    def _recover_stuck_pending(self, consumer_name: str | None = None) -> None:
+        """Reclaim messages stuck in the PEL of dead consumers via XAUTOCLAIM.
+
+        Uses Redis Streams' own delivery state instead of scanning Postgres: any
+        message idle longer than `recovery_min_idle_ms` (i.e. delivered to a
+        consumer that never acked it — typically because that consumer crashed)
+        is transferred to THIS consumer and re-dispatched through the same bounded
+        pool. Idempotency is still guaranteed downstream by the `SELECT ... FOR
+        UPDATE` job claim and the `status == 'done'` short-circuit in _handle_job.
+
+        Only reclaims up to the current free capacity so recovery never overruns
+        the in-flight cap; remaining stuck messages are picked up on later passes.
+        """
+        if consumer_name is None:
+            consumer_name = getattr(self.cfg, "worker_id", None) or "recovery"
+
+        free = self._free_capacity()
+        if free <= 0:
+            return
+
+        try:
+            result = self.redis.xautoclaim(
+                name=self.cfg.stream_name,
+                groupname=self.cfg.consumer_group,
+                consumername=consumer_name,
+                min_idle_time=self._recovery_min_idle_ms,
+                start_id="0-0",
+                count=free,
+            )
+        except (ResponseError, redis.exceptions.RedisError) as exc:
+            logger.warning("xautoclaim recovery failed: %s", exc)
+            return
+
+        # The reply is (next_cursor, claimed_messages) on Redis < 7.0 and
+        # (next_cursor, claimed_messages, deleted_ids) on Redis >= 7.0.
+        messages = result[1] if len(result) >= 2 else []
+        # Cap on how many times a message may be reclaimed before it is treated
+        # as poison. A message that keeps being reclaimed but never acked (e.g. a
+        # job that crashes the worker every time) would otherwise loop forever.
+        max_deliveries = int(getattr(self.cfg.redis, "max_retries", 5))
+        for msg_id, fields in messages:
+            if not fields:
+                # Entry was deleted from the stream but lingered in the PEL; ack
+                # to clear it so it stops being reported as pending.
+                self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id)
+                continue
+
+            deliveries = self._delivery_count(msg_id)
+            if deliveries > max_deliveries:
+                dlq_fields = dict(fields)
+                dlq_fields.update(
+                    {
+                        "original_msg_id": str(msg_id),
+                        "deliveries": str(deliveries),
+                        "reason": "max_deliveries_exceeded",
+                        "failed_at": str(time.time()),
+                    }
                 )
-                payload = {"job_id": str(jid), "asset_id": str(asset_id)}
-                # XADD will append a new message; deduping is handled by the jobs table.
-                self.redis.xadd(self.cfg.stream_name, payload)
+                self._dead_letter(msg_id, dlq_fields)
+                continue
+
+            logger.info("reclaimed idle message id=%s for redispatch", msg_id)
+            self._submit(msg_id, fields)
+
+    def _delivery_count(self, msg_id: str) -> int:
+        """Return how many times `msg_id` has been delivered (from XPENDING).
+
+        Used to detect messages that are repeatedly reclaimed but never acked so
+        they can be dead-lettered. Returns 0 on any error (fail open: prefer
+        re-dispatch over erroneously dead-lettering).
+        """
+        try:
+            pending = self.redis.xpending_range(
+                self.cfg.stream_name,
+                self.cfg.consumer_group,
+                min=msg_id,
+                max=msg_id,
+                count=1,
+            )
+            if pending:
+                return int(pending[0].get("times_delivered", 0))
+        except redis.exceptions.RedisError as exc:
+            logger.warning("xpending lookup failed for %s: %s", msg_id, exc)
+        return 0
diff --git a/worker/consumer/db.py b/worker/consumer/db.py
index 838079d..45ba705 100644
--- a/worker/consumer/db.py
+++ b/worker/consumer/db.py
@@ -3,8 +3,24 @@
 
 
 class PgPool:
-    def __init__(self, dsn):
-        self._pool = ConnectionPool(conninfo=dsn, max_size=10, open=True)
+    """Thin wrapper over psycopg_pool.ConnectionPool.
+
+    `max_size` must be sized to the worker concurrency: each in-flight job holds
+    at most one connection at a time, so the pool needs at least
+    MAX_CONCURRENT_JOBS connections plus a little headroom for recovery and
+    bookkeeping queries that may run alongside job processing. Under-sizing the
+    pool would silently cap effective concurrency (jobs would block waiting for a
+    connection) — watch mpiper_db_connections_wait_count if you suspect this.
+    """
+
+    def __init__(self, dsn, max_size: int = 10):
+        size = max(1, int(max_size))
+        # psycopg_pool defaults min_size=4; clamp it under max_size so small
+        # pools (low MAX_CONCURRENT_JOBS) don't violate min_size <= max_size.
+        min_size = min(4, size)
+        self._pool = ConnectionPool(
+            conninfo=dsn, min_size=min_size, max_size=size, open=True
+        )
 
     @contextmanager
     def get_pg_conn(self):
diff --git a/worker/consumer/main.py b/worker/consumer/main.py
index 673298c..5b74b24 100644
--- a/worker/consumer/main.py
+++ b/worker/consumer/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import signal
 import time
 
@@ -10,6 +11,7 @@
 from worker.consumer.migrations import run_migrations
 from worker.storage import get_storage
 from worker.utils import metrics as worker_metrics
+from worker.utils import tracing as worker_tracing
 
 logger = logging.getLogger(__name__)
 
@@ -19,6 +21,33 @@ def main():
     logger.info("Starting worker consumer...")
 
     cfg = get_config()
+
+    # Initialise telemetry before anything else so startup is observable.
+    # NOTE: init_metrics() was previously never called — worker OTel metrics
+    # were defined but never wired up. We initialise both tracing and metrics
+    # here from the same OtelConfig so they share endpoint/resource/lifecycle.
+    otel = cfg.otel
+    try:
+        worker_tracing.init_tracing(
+            service_name=otel.service_name,
+            service_version=otel.service_version,
+            endpoint=otel.endpoint,
+            deployment_env=otel.deployment_env,
+            instance_id=otel.instance_id,
+            tls_insecure=otel.tls_insecure,
+        )
+        worker_metrics.init_metrics(
+            service_name=otel.service_name,
+            service_version=otel.service_version,
+            endpoint=otel.endpoint,
+            deployment_env=otel.deployment_env,
+            instance_id=otel.instance_id,
+            tls_insecure=otel.tls_insecure,
+        )
+    except Exception:
+        # Telemetry must never prevent the worker from processing jobs.
+        logger.exception("failed to initialise telemetry; continuing without it")
+
     storage = get_storage(cfg)
     password = quote_plus(cfg.database.password)
 
@@ -29,10 +58,24 @@ def main():
 
     if cfg.auto_migrate:
         logger.info("AUTO_MIGRATE=true: running migrations")
-        run_migrations(dsn, migrations_dir=cfg.migrations_dir)
+        run_migrations(
+            dsn,
+            migrations_dir=cfg.migrations_dir,
+            allow_destructive=cfg.migration_allow_destructive,
+        )
         logger.info("Migrations applied successfully")
 
-    pg = PgPool(dsn=dsn)
+    # Size the DB pool to the worker concurrency. Each in-flight job holds at
+    # most one connection at a time, so MAX_CONCURRENT_JOBS connections plus a
+    # small headroom (recovery/bookkeeping queries) avoids jobs blocking on the
+    # pool while staying well under Postgres' connection limit.
+    db_pool_size = max(1, cfg.max_concurrent_jobs) + 2
+    pg = PgPool(dsn=dsn, max_size=db_pool_size)
+    logger.info(
+        "db pool sized to %d (max_concurrent_jobs=%d + 2 headroom)",
+        db_pool_size,
+        cfg.max_concurrent_jobs,
+    )
     consumer = Consumer(
         pg_pool=pg, storage=storage, redis_url=cfg.redis.connection_string, cfg=cfg
     )
@@ -50,18 +93,29 @@ def _term(signum, frame):
     logger.info("starting job loop")
     while not shutdown:
         try:
-            processed = consumer.consume(
-                cfg.stream_name
-            )  # single iteration --- returns True if did work
-            if not processed:
-                time.sleep(cfg.job_poll_interval)
+            # consume() tops up the bounded worker pool with as many new messages
+            # as there is free capacity, then returns. It returns False when the
+            # pool is full or no messages were available — sleep briefly so freed
+            # slots are picked up promptly without busy-spinning.
+            did_work = consumer.consume(cfg.worker_id)
+            if not did_work:
+                time.sleep(min(cfg.job_poll_interval, 0.5))
         except Exception:
             logger.exception("unhandled error in loop")
             time.sleep(1)
 
+    logger.info("draining in-flight jobs before exit")
+    # Bounded drain: wait up to SHUTDOWN_DRAIN_TIMEOUT seconds for running jobs
+    # to finish. Anything still running is abandoned (its message stays in the
+    # PEL) and reclaimed by XAUTOCLAIM recovery later. Keep this <= the container
+    # stop_grace_period so we shut down cleanly instead of being SIGKILLed.
+    drain_timeout = float(os.getenv("SHUTDOWN_DRAIN_TIMEOUT", "30"))
+    consumer.shutdown(timeout=drain_timeout)
+
     logger.info("exiting")
-    
-    # Shutdown metrics on exit
+
+    # Shutdown telemetry on exit (flush pending spans + metrics).
+    worker_tracing.shutdown_tracing()
     worker_metrics.shutdown_metrics()
 
 
diff --git a/worker/consumer/migrations.py b/worker/consumer/migrations.py
index 525503c..bcc71b4 100644
--- a/worker/consumer/migrations.py
+++ b/worker/consumer/migrations.py
@@ -31,8 +31,23 @@ def _migration_files(migrations_dir: Path):
     return result
 
 
-def run_migrations(dsn: str, migrations_dir: str | None = None) -> None:
-    """Apply all pending migrations from migrations_dir against the given DSN."""
+# Versions that drop or alter existing user data. They must be opted into
+# explicitly via allow_destructive=True (driven by MIGRATION_ALLOW_DESTRUCTIVE)
+# so a fresh database bootstrap never silently wipes data.
+_DESTRUCTIVE_VERSIONS = {7, 8}
+
+
+def run_migrations(
+    dsn: str,
+    migrations_dir: str | None = None,
+    allow_destructive: bool = False,
+) -> None:
+    """Apply all pending migrations from migrations_dir against the given DSN.
+
+    Destructive migrations (versions 7 and 8) are refused unless
+    allow_destructive=True; the check covers every migration file on disk,
+    applied or not, and runs before any database connection is opened.
+    """
     if migrations_dir is None:
         migrations_dir = os.getenv(
             "MIGRATIONS_DIR",
@@ -43,13 +58,26 @@ def run_migrations(dsn: str, migrations_dir: str | None = None) -> None:
     if not path.is_dir():
         raise RuntimeError(f"Migrations directory not found: {path}")
 
+    pending = _migration_files(path)
+
+    if not allow_destructive:
+        # Filenames are zero-padded ("000007_…") but _DESTRUCTIVE_VERSIONS is
+        # expressed in canonical numeric form; normalise via int() so either
+        # padding compares equal.
+        destructive_pending = sorted(
+            {v for v, _ in pending if int(v) in _DESTRUCTIVE_VERSIONS}
+        )
+        if destructive_pending:
+            raise RuntimeError(
+                f"destructive migrations {destructive_pending} are pending. "
+                f"Set MIGRATION_ALLOW_DESTRUCTIVE=true to apply them"
+            )
+
     with psycopg.connect(dsn) as conn:
         conn.autocommit = True
         with conn.cursor() as cur:
             cur.execute(_TRACKING_TABLE)
 
-        pending = _migration_files(path)
-
         for version, sql_file in pending:
             with conn.cursor() as cur:
                 cur.execute(
diff --git a/worker/processing/images.py b/worker/processing/images.py
index e4c03f5..0d3a6b5 100644
--- a/worker/processing/images.py
+++ b/worker/processing/images.py
@@ -3,7 +3,10 @@
 import logging
 import os
 
+from opentelemetry import trace
+
 logger = logging.getLogger("images")
+tracer = trace.get_tracer("worker.processing.images")
 
 IMAGE_VARIANTS = [
     {
@@ -46,6 +49,7 @@ def encode_image(img: Image.Image, fmt: str, quality: int = 80) -> bytes:
 
 def process_image_file(
     asset_id: str,
+    owner_id: str,
     local_raw_path: str,
     content_hash: str,
     pg_pool,
@@ -68,40 +72,48 @@ def process_image_file(
             role = v["role"]
             logger.info("generating image variant %s for asset %s", role, asset_id)
 
-            if v["crop"]:
-                out_img = ImageOps.fit(
-                    img,
-                    (v["width"], v["height"]),
-                    Image.LANCZOS,
-                    centering=(0.5, 0.5),
-                )
-            else:
-                target_w = v["width"] or src_width
-                ratio = target_w / float(src_width)
-                target_h = int(src_height * ratio)
-                out_img = img.resize((target_w, target_h), Image.LANCZOS)
-
-            data = encode_image(out_img, v["format"], v["quality"])
-
-            key = f"media/processed/{asset_id}/{role}.{v['format']}"
-            storage.upload_bytes(key, data, content_type=f"image/{v['format']}")
-            url = storage.public_url(key)
-
-            # Upsert into variants.image (PK is asset_id + role)
-            with pg_pool.get_pg_conn() as conn:
-                conn.cursor().execute(
-                    """
-                    INSERT INTO variants.image (asset_id, url, role, width, height, size_bytes, format)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s)
-                    ON CONFLICT (asset_id, role) DO UPDATE SET
-                        url = EXCLUDED.url,
-                        width = EXCLUDED.width,
-                        height = EXCLUDED.height,
-                        size_bytes = EXCLUDED.size_bytes,
-                        format = EXCLUDED.format
-                    """,
-                    (asset_id, url, role, out_img.width, out_img.height, len(data), v["format"]),
-                )
+            with tracer.start_as_current_span("image.variant") as span:
+                span.set_attribute("asset_id", asset_id)
+                span.set_attribute("variant.role", role)
+                span.set_attribute("variant.format", v["format"])
+
+                if v["crop"]:
+                    out_img = ImageOps.fit(
+                        img,
+                        (v["width"], v["height"]),
+                        Image.LANCZOS,
+                        centering=(0.5, 0.5),
+                    )
+                else:
+                    target_w = v["width"] or src_width
+                    ratio = target_w / float(src_width)
+                    target_h = int(src_height * ratio)
+                    out_img = img.resize((target_w, target_h), Image.LANCZOS)
+
+                data = encode_image(out_img, v["format"], v["quality"])
+                span.set_attribute("variant.width", out_img.width)
+                span.set_attribute("variant.height", out_img.height)
+                span.set_attribute("variant.size_bytes", len(data))
+
+                key = f"media/{owner_id}/processed/{asset_id}/{role}.{v['format']}"
+                storage.upload_bytes(key, data, content_type=f"image/{v['format']}")
+                url = storage.public_url(key)
+
+                # Upsert into variants.image (PK is asset_id + role)
+                with pg_pool.get_pg_conn() as conn:
+                    conn.cursor().execute(
+                        """
+                        INSERT INTO variants.image (asset_id, url, role, width, height, size_bytes, format)
+                        VALUES (%s, %s, %s, %s, %s, %s, %s)
+                        ON CONFLICT (asset_id, role) DO UPDATE SET
+                            url = EXCLUDED.url,
+                            width = EXCLUDED.width,
+                            height = EXCLUDED.height,
+                            size_bytes = EXCLUDED.size_bytes,
+                            format = EXCLUDED.format
+                        """,
+                        (asset_id, url, role, out_img.width, out_img.height, len(data), v["format"]),
+                    )
 
     # Mark asset ready
     with pg_pool.get_pg_conn() as conn:
diff --git a/worker/processing/processor.py b/worker/processing/processor.py
index 9e6244f..b0a4890 100644
--- a/worker/processing/processor.py
+++ b/worker/processing/processor.py
@@ -1,6 +1,9 @@
 import os
+import time
 from enum import Enum
 
+from opentelemetry import trace
+
 from worker.consumer.config import WorkerConfig
 from worker.consumer.db import PgPool
 from worker.processing.images import process_image_file
@@ -8,8 +11,10 @@
 from worker.storage.base import StorageX
 from worker.utils.hash import compute_file_hash
 from worker.utils.logger import get_logger
+from worker.utils import metrics as wm
 
 logger = get_logger(__name__)
+tracer = trace.get_tracer("worker.processing")
 
 
 class AssetStatus(Enum):
@@ -133,88 +138,122 @@ def process_asset_dispatch(
     """
     Main entry point for asset processing with deduplication.
     """
-    # 1. Load asset metadata
-    with pg_pool.get_pg_conn() as conn:
-        cur = conn.cursor()
-        cur.execute(
-            """
-            SELECT asset_id, type, status, original_url, mime_type, content_hash
-            FROM assets
-            WHERE asset_id = %s
-            """,
-            (asset_id,),
-        )
-        row = cur.fetchone()
-        if not row:
-            raise RuntimeError(f"Asset not found: {asset_id}")
-
-        _, typ, status, original_url, mime_type, content_hash = row
+    with tracer.start_as_current_span("process.dispatch") as span:
+        # asset_id is fine as a span attribute (high cardinality is OK on traces),
+        # but must NEVER become a metric label.
+        span.set_attribute("asset_id", asset_id)
 
-    # 2. Early exit if already processed
-    if status in (AssetStatus.READY.value, AssetStatus.DUPLICATE.value):
-        logger.info("Asset %s already in final state: %s", asset_id, status)
-        return
-
-    # 3. Proceed with processing
-    local_raw_file = None
-    try:
-        # Mark as processing
+        # 1. Load asset metadata
         with pg_pool.get_pg_conn() as conn:
             cur = conn.cursor()
             cur.execute(
-                "UPDATE assets SET status = %s WHERE asset_id = %s",
-                (AssetStatus.PROCESSING.value, asset_id)
+                """
+                SELECT asset_id, type, status, original_url, mime_type, content_hash, owner_id
+                FROM assets
+                WHERE asset_id = %s
+                """,
+                (asset_id,),
             )
+            row = cur.fetchone()
+            if not row:
+                raise RuntimeError(f"Asset not found: {asset_id}")
 
-        # Download raw file
-        raw_key = f"media/raw/{asset_id}"
-        tmp_dir = cfg.temp_dir
-        os.makedirs(tmp_dir, exist_ok=True)
-        local_raw_file = os.path.join(
-            tmp_dir, f"{asset_id}-raw.{get_extension_for_mime(mime_type)}"
-        )
-        storage.download_to_file(raw_key, local_raw_file)
+            _, typ, status, original_url, mime_type, content_hash, owner_id = row
+
+        if not owner_id:
+            raise RuntimeError(f"Asset has no owner: {asset_id}")
 
-        content_hash = compute_file_hash(local_raw_file)
+        span.set_attribute("asset.type", typ or "unknown")
+        span.set_attribute("asset.status", status or "unknown")
 
-        # Check for duplicate using the actual downloaded file's hash
-        if content_hash:
-            dedup_result = check_for_duplicate(content_hash, asset_id, pg_pool)
+        # 2. Early exit if already processed
+        if status in (AssetStatus.READY.value, AssetStatus.DUPLICATE.value):
+            logger.info("Asset %s already in final state: %s", asset_id, status)
+            span.set_attribute("dispatch.short_circuit", "already_final")
+            return
 
-            if dedup_result == DedupResult.DUPLICATE_READY:
-                logger.info("Asset %s deduplicated successfully", asset_id)
-                return
-            elif dedup_result == DedupResult.DUPLICATE_PENDING:
-                raise RetryableException(
-                    f"Canonical asset for {asset_id} not ready yet"
+        # 3. Proceed with processing
+        local_raw_file = None
+        proc_start = time.time()
+        try:
+            # Mark as processing
+            with pg_pool.get_pg_conn() as conn:
+                cur = conn.cursor()
+                cur.execute(
+                    "UPDATE assets SET status = %s WHERE asset_id = %s",
+                    (AssetStatus.PROCESSING.value, asset_id)
                 )
 
-        # Process based on type
-        if typ == "image":
-            process_image_file(
-                asset_id, local_raw_file, content_hash, pg_pool, storage, cfg
+            # Download raw file
+            raw_key = f"media/{owner_id}/raw/{asset_id}"
+            tmp_dir = cfg.temp_dir
+            os.makedirs(tmp_dir, exist_ok=True)
+            local_raw_file = os.path.join(
+                tmp_dir, f"{asset_id}-raw.{get_extension_for_mime(mime_type)}"
             )
-        elif typ == "video":
-            process_video_file(
-                asset_id, local_raw_file, content_hash, pg_pool, storage, cfg
-            )
-        else:
-            raise ValueError(f"Unknown asset type: {typ}")
-
-    except Exception as e:
-        # Do not touch assets.status here. The consumer (_handle_job) owns the
-        # asset state transition: it marks the asset failed only after the retry
-        # cap is hit, and ready on success. Writing 'failed' on every exception
-        # — including RetryableException — left the asset stuck failed across
-        # retries even though the job was still pending. See DEV-34.
-        logger.error("Failed to process asset %s: %s", asset_id, e, exc_info=True)
-        raise
-    finally:
-        if local_raw_file and os.path.exists(local_raw_file):
-            try:
-                os.unlink(local_raw_file)
-            except OSError:
-                logger.warning("Failed to delete temp file %s", local_raw_file)
+            with tracer.start_as_current_span("process.download") as dl_span:
+                dl_span.set_attribute("asset_id", asset_id)
+                dl_span.set_attribute("storage.key", raw_key)
+                storage.download_to_file(raw_key, local_raw_file)
+                try:
+                    dl_span.set_attribute(
+                        "download.size_bytes", os.path.getsize(local_raw_file)
+                    )
+                except OSError:
+                    pass
+
+            content_hash = compute_file_hash(local_raw_file)
+
+            # Check for duplicate using the actual downloaded file's hash
+            if content_hash:
+                with tracer.start_as_current_span("process.dedup_check") as dd_span:
+                    dd_span.set_attribute("asset_id", asset_id)
+                    dd_span.set_attribute("content_hash", content_hash)
+                    dedup_result = check_for_duplicate(content_hash, asset_id, pg_pool)
+                    dd_span.set_attribute("dedup.result", dedup_result.value)
+
+                if dedup_result == DedupResult.DUPLICATE_READY:
+                    logger.info("Asset %s deduplicated successfully", asset_id)
+                    span.set_attribute("dispatch.short_circuit", "deduplicated")
+                    return
+                elif dedup_result == DedupResult.DUPLICATE_PENDING:
+                    raise RetryableException(
+                        f"Canonical asset for {asset_id} not ready yet"
+                    )
+
+            # Process based on type
+            proc_start = time.time()
+            if typ == "image":
+                process_image_file(
+                    asset_id, owner_id, local_raw_file, content_hash, pg_pool, storage, cfg
+                )
+            elif typ == "video":
+                process_video_file(
+                    asset_id, owner_id, local_raw_file, content_hash, pg_pool, storage, cfg
+                )
+            else:
+                raise ValueError(f"Unknown asset type: {typ}")
+
+            # Asset processing duration, labelled by type only (low cardinality).
+            # Feeds the image/video "ready latency" SLIs. asset_id stays a span
+            # attribute, never a metric label.
+            wm.record_asset(typ, time.time() - proc_start, success=True)
+
+        except Exception as e:
+            # Do not touch assets.status here. The consumer (_handle_job) owns the
+            # asset state transition: it marks the asset failed only after the retry
+            # cap is hit, and ready on success. Writing 'failed' on every exception
+            # — including RetryableException — left the asset stuck failed across
+            # retries even though the job was still pending. See DEV-34.
+            logger.error("Failed to process asset %s: %s", asset_id, e, exc_info=True)
+            wm.record_asset(typ, time.time() - proc_start, success=False)
+            raise
+        finally:
+            if local_raw_file and os.path.exists(local_raw_file):
+                try:
+                    os.unlink(local_raw_file)
+                except OSError:
+                    logger.warning("Failed to delete temp file %s", local_raw_file)
 
 
 def clone_image_variants(cur, canonical_asset_id: str, new_asset_id: str) -> int:
diff --git a/worker/processing/videos.py b/worker/processing/videos.py
index 4432e6a..8150304 100644
--- a/worker/processing/videos.py
+++ b/worker/processing/videos.py
@@ -5,15 +5,25 @@
 import subprocess
 import tempfile
 
+from opentelemetry import trace
+
 logger = logging.getLogger("videos")
+tracer = trace.get_tracer("worker.processing.videos")
 
 
 def run(cmd: list[str]) -> None:
     """Execute ffmpeg command with error handling."""
     logger.info("ffmpeg: %s", " ".join(cmd))
-    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if res.returncode != 0:
-        raise RuntimeError(f"FFmpeg failed: {res.stderr.decode()}")
+    with tracer.start_as_current_span("ffmpeg.exec") as span:
+        # cmd[0] is the binary (ffmpeg/ffprobe); record it without the full
+        # argv to avoid leaking paths as high-cardinality span attributes.
+        span.set_attribute("ffmpeg.binary", cmd[0] if cmd else "")
+        res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        span.set_attribute("ffmpeg.returncode", res.returncode)
+        if res.returncode != 0:
+            err = res.stderr.decode()
+            span.set_status(trace.StatusCode.ERROR, "ffmpeg failed")
+            raise RuntimeError(f"FFmpeg failed: {err}")
 
 
 def probe_video(local_path: str) -> dict:
@@ -42,7 +52,7 @@ def probe_video(local_path: str) -> dict:
     }
 
 
-def generate_poster(asset_id, local_raw_path, storage, cfg, pg_pool):
+def generate_poster(asset_id, owner_id, local_raw_path, storage, cfg, pg_pool):
     """Generate video poster thumbnail and store as image variant."""
     tmpdir = tempfile.mkdtemp(dir=cfg.temp_dir)
     try:
@@ -56,7 +66,7 @@ def generate_poster(asset_id, local_raw_path, storage, cfg, pg_pool):
         with open(out_path, "rb") as f:
             data = f.read()
 
-        key = f"media/processed/{asset_id}/poster.jpg"
+        key = f"media/{owner_id}/processed/{asset_id}/poster.jpg"
         storage.upload_bytes(key, data, content_type="image/jpeg")
         url = storage.public_url(key)
 
@@ -74,7 +84,7 @@ def generate_poster(asset_id, local_raw_path, storage, cfg, pg_pool):
         shutil.rmtree(tmpdir, ignore_errors=True)
 
 
-def transcode_720p(asset_id, local_raw_path, storage, cfg, pg_pool):
+def transcode_720p(asset_id, owner_id, local_raw_path, storage, cfg, pg_pool):
     """Transcode video to 720p H.264."""
     tmpdir = tempfile.mkdtemp(dir=cfg.temp_dir)
     try:
@@ -90,7 +100,7 @@ def transcode_720p(asset_id, local_raw_path, storage, cfg, pg_pool):
         with open(out_path, "rb") as f:
             data = f.read()
 
-        key = f"media/processed/{asset_id}/transcoded.mp4"
+        key = f"media/{owner_id}/processed/{asset_id}/transcoded.mp4"
         storage.upload_bytes(key, data, content_type="video/mp4")
         url = storage.public_url(key)
 
@@ -108,7 +118,7 @@ def transcode_720p(asset_id, local_raw_path, storage, cfg, pg_pool):
         shutil.rmtree(tmpdir, ignore_errors=True)
 
 
-def generate_preview(asset_id, local_raw_path, storage, cfg, pg_pool):
+def generate_preview(asset_id, owner_id, local_raw_path, storage, cfg, pg_pool):
     """Generate short muted preview clip."""
     tmpdir = tempfile.mkdtemp(dir=cfg.temp_dir)
     try:
@@ -123,7 +133,7 @@ def generate_preview(asset_id, local_raw_path, storage, cfg, pg_pool):
         with open(out_path, "rb") as f:
             data = f.read()
 
-        key = f"media/processed/{asset_id}/preview.mp4"
+        key = f"media/{owner_id}/processed/{asset_id}/preview.mp4"
         storage.upload_bytes(key, data, content_type="video/mp4")
         url = storage.public_url(key)
 
@@ -141,7 +151,7 @@ def generate_preview(asset_id, local_raw_path, storage, cfg, pg_pool):
         shutil.rmtree(tmpdir, ignore_errors=True)
 
 
-def process_video_file(asset_id, local_raw_path, content_hash, pg_pool, storage, cfg):
+def process_video_file(asset_id, owner_id, local_raw_path, content_hash, pg_pool, storage, cfg):
     """Main video processing pipeline."""
     logger.info("Processing video asset %s", asset_id)
 
@@ -152,9 +162,14 @@ def process_video_file(asset_id, local_raw_path, content_hash, pg_pool, storage,
             (content_hash, asset_id),
         )
 
-    generate_poster(asset_id, local_raw_path, storage, cfg, pg_pool)
-    transcode_720p(asset_id, local_raw_path, storage, cfg, pg_pool)
-    generate_preview(asset_id, local_raw_path, storage, cfg, pg_pool)
+    for stage_name, fn in (
+        ("video.poster", generate_poster),
+        ("video.transcode_720p", transcode_720p),
+        ("video.preview", generate_preview),
+    ):
+        with tracer.start_as_current_span(stage_name) as span:
+            span.set_attribute("asset_id", asset_id)
+            fn(asset_id, owner_id, local_raw_path, storage, cfg, pg_pool)
 
     # Mark asset ready
     with pg_pool.get_pg_conn() as conn:
diff --git a/worker/tests/test_consumer_pool.py b/worker/tests/test_consumer_pool.py
new file mode 100644
index 0000000..0439280
--- /dev/null
+++ b/worker/tests/test_consumer_pool.py
@@ -0,0 +1,90 @@
+import threading
+import time
+import unittest
+from unittest.mock import MagicMock, patch
+
+from worker.consumer.consumer import Consumer
+
+
+def _make_consumer(max_concurrent_jobs=2):
+    cfg = MagicMock()
+    cfg.stream_name = "media:jobs"
+    cfg.consumer_group = "media-workers"
+    cfg.max_concurrent_jobs = max_concurrent_jobs
+    with patch("worker.consumer.consumer.redis.Redis.from_url") as from_url:
+        client = MagicMock()
+        from_url.return_value = client
+        consumer = Consumer(
+            pg_pool=MagicMock(), redis_url="redis://x", storage=MagicMock(), cfg=cfg
+        )
+    # Recovery is exercised elsewhere; stub it out here.
+    consumer._recover_stuck_pending = MagicMock()
+    return consumer, client
+
+
+class TestBoundedPool(unittest.TestCase):
+    def test_reads_only_free_capacity_and_caps_inflight(self):
+        consumer, client = _make_consumer(max_concurrent_jobs=2)
+
+        # Two messages available; both handlers block so they stay in-flight.
+        client.xreadgroup.return_value = [
+            ("media:jobs", [("1-0", {"job_id": "1"}), ("2-0", {"job_id": "2"})])
+        ]
+        release = threading.Event()
+        consumer._handle_job = MagicMock(side_effect=lambda *_: release.wait(5))
+
+        self.assertTrue(consumer.consume("w"))
+        # Both slots taken -> no free capacity.
+        self.assertEqual(consumer._free_capacity(), 0)
+        # The read requested exactly the free capacity (2).
+        self.assertEqual(client.xreadgroup.call_args.kwargs["count"], 2)
+
+        # At capacity, consume() returns False WITHOUT issuing another read.
+        client.xreadgroup.reset_mock()
+        self.assertFalse(consumer.consume("w"))
+        client.xreadgroup.assert_not_called()
+
+        # Release the handlers and confirm capacity is restored.
+        release.set()
+        consumer._await_inflight(timeout=5)
+        self.assertEqual(consumer._free_capacity(), 2)
+
+    def test_failed_task_leaves_message_unacked(self):
+        consumer, client = _make_consumer()
+        client.xreadgroup.return_value = [("media:jobs", [("9-0", {"job_id": "7"})])]
+        consumer._handle_job = MagicMock(side_effect=RuntimeError("boom"))
+
+        consumer.consume("w")
+        consumer._await_inflight(timeout=5)
+
+        client.xack.assert_not_called()
+
+    def test_malformed_message_acked_by_msg_id(self):
+        consumer, client = _make_consumer()
+        # Neither job_id nor asset_id -> malformed; wrapper acks to drop it.
+        client.xreadgroup.return_value = [("media:jobs", [("5-0", {"foo": "bar"})])]
+
+        consumer.consume("w")
+        consumer._await_inflight(timeout=5)
+
+        client.xack.assert_called_once_with("media:jobs", "media-workers", "5-0")
+
+    def test_graceful_drain_waits_for_inflight(self):
+        consumer, client = _make_consumer()
+        client.xreadgroup.return_value = [("media:jobs", [("1-0", {"job_id": "1"})])]
+        done = threading.Event()
+
+        def slow(*_):
+            time.sleep(0.3)
+            done.set()
+
+        consumer._handle_job = MagicMock(side_effect=slow)
+        consumer.consume("w")
+
+        consumer.shutdown(timeout=5)
+        # A bounded-but-sufficient drain lets the in-flight job finish.
+        self.assertTrue(done.is_set())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_consumer_recovery.py b/worker/tests/test_consumer_recovery.py
index d76cb40..708e80b 100644
--- a/worker/tests/test_consumer_recovery.py
+++ b/worker/tests/test_consumer_recovery.py
@@ -10,6 +10,8 @@ def _make_consumer():
     cfg = MagicMock()
     cfg.stream_name = "media:jobs"
     cfg.consumer_group = "media-workers"
+    cfg.max_concurrent_jobs = 2
+    cfg.redis.max_retries = 5
     with patch("worker.consumer.consumer.redis.Redis.from_url") as from_url:
         client = MagicMock()
         from_url.return_value = client
@@ -33,6 +35,7 @@ def test_recovery_fires_under_load_when_interval_elapsed(self):
         consumer._last_recovery = 0.0
 
         result = consumer.consume("worker-1")
+        consumer._await_inflight(timeout=5)
 
         self.assertTrue(result)  # work was performed
         consumer._handle_job.assert_called_once_with("42", "1-0")
@@ -52,5 +55,50 @@ def test_recovery_does_not_fire_before_interval_elapses(self):
         consumer._recover_stuck_pending.assert_not_called()  # gate holds
 
 
+class TestXAutoClaimRecovery(unittest.TestCase):
+    """Recovery now reclaims idle PEL messages via XAUTOCLAIM and re-dispatches."""
+
+    def test_reclaims_idle_messages_and_redispatches(self):
+        consumer, client = _make_consumer()
+        consumer._handle_job = MagicMock()
+        # (next_cursor, claimed_messages, deleted_ids)
+        client.xautoclaim.return_value = (
+            "0-0",
+            [("1-0", {"job_id": "5"})],
+            [],
+        )
+
+        consumer._recover_stuck_pending("worker-1")
+        consumer._await_inflight(timeout=5)
+
+        # Claimed with the configured min-idle and bounded by free capacity.
+        kwargs = client.xautoclaim.call_args.kwargs
+        self.assertEqual(kwargs["min_idle_time"], consumer._recovery_min_idle_ms)
+        self.assertEqual(kwargs["consumername"], "worker-1")
+        self.assertEqual(kwargs["count"], 2)  # free capacity
+        # Reclaimed message dispatched through the pool.
+        consumer._handle_job.assert_called_once_with("5", "1-0")
+
+    def test_skips_when_no_free_capacity(self):
+        consumer, client = _make_consumer()
+        # Saturate in-flight so there is no capacity to reclaim into.
+        with consumer._inflight_lock:
+            consumer._inflight = consumer._max_workers
+
+        consumer._recover_stuck_pending("worker-1")
+
+        client.xautoclaim.assert_not_called()
+
+    def test_acks_tombstoned_entries(self):
+        consumer, client = _make_consumer()
+        # A claimed entry whose data was deleted from the stream (fields None).
+        client.xautoclaim.return_value = ("0-0", [("7-0", None)], [])
+
+        consumer._recover_stuck_pending("worker-1")
+        consumer._await_inflight(timeout=5)
+
+        client.xack.assert_called_once_with("media:jobs", "media-workers", "7-0")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/worker/tests/test_consumer_retry.py b/worker/tests/test_consumer_retry.py
index fcc281c..73dd1f2 100644
--- a/worker/tests/test_consumer_retry.py
+++ b/worker/tests/test_consumer_retry.py
@@ -9,6 +9,7 @@ def _make_consumer(max_retries=3):
     cfg = MagicMock()
     cfg.stream_name = "media:jobs"
     cfg.consumer_group = "media-workers"
+    cfg.max_concurrent_jobs = 1
     cfg.redis.max_retries = max_retries
     with patch("worker.consumer.consumer.redis.Redis.from_url") as from_url:
         from_url.return_value = MagicMock()
@@ -44,6 +45,9 @@ def test_retryable_exception_requeues_below_cap(self, mock_dispatch):
         sql = _executed_sql(cursor)
         self.assertIn("UPDATE jobs SET status = 'pending'", sql)
         self.assertNotIn("UPDATE assets SET status = 'failed'", sql)
+        # Retryable failure stays in the PEL: no DLQ, no ack.
+        consumer.redis.xadd.assert_not_called()
+        consumer.redis.xack.assert_not_called()
 
     @patch("worker.consumer.consumer.process_asset_dispatch")
     def test_non_retryable_exception_fails_immediately(self, mock_dispatch):
@@ -56,6 +60,9 @@ def test_non_retryable_exception_fails_immediately(self, mock_dispatch):
         # Fails now despite attempts (0) being below the cap.
         self.assertIn("UPDATE assets SET status = 'failed'", sql)
         self.assertNotIn("UPDATE jobs SET status = 'pending'", sql)
+        # Poison message is dead-lettered and the original acked.
+        consumer.redis.xadd.assert_called_once()
+        consumer.redis.xack.assert_called_once_with("media:jobs", "media-workers", "1-0")
 
 
 if __name__ == "__main__":
diff --git a/worker/tests/test_consumer_tracing.py b/worker/tests/test_consumer_tracing.py
new file mode 100644
index 0000000..e88b949
--- /dev/null
+++ b/worker/tests/test_consumer_tracing.py
@@ -0,0 +1,113 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from opentelemetry.propagate import set_global_textmap
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry.trace.propagation.tracecontext import (
+    TraceContextTextMapPropagator,
+)
+
+from worker.consumer.consumer import Consumer
+
+
+TRACE_ID_HEX = "0af7651916cd43dd8448eb211c80319c"
+SPAN_ID_HEX = "b7ad6b7169203331"
+TRACEPARENT = f"00-{TRACE_ID_HEX}-{SPAN_ID_HEX}-01"
+
+
+def _make_consumer():
+    cfg = MagicMock()
+    cfg.stream_name = "media:jobs"
+    cfg.consumer_group = "media-workers"
+    cfg.max_concurrent_jobs = 2
+    with patch("worker.consumer.consumer.redis.Redis.from_url") as from_url:
+        from_url.return_value = MagicMock()
+        consumer = Consumer(
+            pg_pool=MagicMock(), redis_url="redis://x", storage=MagicMock(), cfg=cfg
+        )
+    return consumer, from_url.return_value
+
+
+class TestConsumeSpanPropagation(unittest.TestCase):
+    """Phase 1d: the worker continues the producer trace across the queue."""
+
+    def setUp(self):
+        set_global_textmap(TraceContextTextMapPropagator())
+        self.exporter = InMemorySpanExporter()
+        provider = TracerProvider()
+        provider.add_span_processor(SimpleSpanProcessor(self.exporter))
+        self.tracer = provider.get_tracer("test")
+
+    def _run_consume_with(self, fields):
+        consumer, client = _make_consumer()
+        client.xreadgroup.return_value = [("media:jobs", [("1-0", fields)])]
+        consumer._recover_stuck_pending = MagicMock()
+        consumer._handle_job = MagicMock()
+        consumer._handle_asset_message = MagicMock()
+        with patch(
+            "worker.consumer.consumer.get_tracer", return_value=self.tracer
+        ):
+            consumer.consume("worker-1")
+            # Dispatch is now async (thread pool); wait for the task to finish
+            # while the tracer patch is still active so the span is captured.
+            consumer._await_inflight(timeout=5)
+        return consumer
+
+    def test_consume_span_is_child_and_linked_to_producer(self):
+        consumer = self._run_consume_with({"job_id": "42", "traceparent": TRACEPARENT})
+        consumer._handle_job.assert_called_once_with("42", "1-0")
+
+        spans = self.exporter.get_finished_spans()
+        consume = next(s for s in spans if s.name == "worker.consume")
+
+        expected_trace_id = int(TRACE_ID_HEX, 16)
+        expected_span_id = int(SPAN_ID_HEX, 16)
+
+        # Child: parent is the producer context, and the span continues the trace.
+        self.assertIsNotNone(consume.parent)
+        self.assertEqual(consume.parent.trace_id, expected_trace_id)
+        self.assertEqual(consume.parent.span_id, expected_span_id)
+        self.assertEqual(consume.context.trace_id, expected_trace_id)
+
+        # Link: queue fan-in primitive points at the same producer context.
+        self.assertTrue(consume.links)
+        self.assertEqual(consume.links[0].context.trace_id, expected_trace_id)
+
+    def test_consume_without_traceparent_starts_new_trace(self):
+        consumer = self._run_consume_with({"job_id": "42"})
+        consumer._handle_job.assert_called_once_with("42", "1-0")
+        spans = self.exporter.get_finished_spans()
+        consume = next(s for s in spans if s.name == "worker.consume")
+        # No producer context -> root span, no link.
+        self.assertIsNone(consume.parent)
+        self.assertFalse(consume.links)
+
+    def test_traceparent_merged_from_body(self):
+        import json
+
+        body = json.dumps({"job_id": "42"})
+        consumer = self._run_consume_with({"body": body, "traceparent": TRACEPARENT})
+        consumer._handle_job.assert_called_once_with("42", "1-0")
+        spans = self.exporter.get_finished_spans()
+        consume = next(s for s in spans if s.name == "worker.consume")
+        self.assertEqual(consume.parent.trace_id, int(TRACE_ID_HEX, 16))
+
+
+class TestTracerInit(unittest.TestCase):
+    def test_init_tracing_sets_tracer(self):
+        import worker.utils.tracing as tracing
+
+        # Reset module state for a clean init.
+        tracing._tracer = None
+        tracing._provider = None
+        tracing.init_tracing(endpoint="otel-collector:4317", deployment_env="local")
+        self.assertIsNotNone(tracing.get_tracer())
+        tracing.shutdown_tracing()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_db_pool.py b/worker/tests/test_db_pool.py
new file mode 100644
index 0000000..3a10e61
--- /dev/null
+++ b/worker/tests/test_db_pool.py
@@ -0,0 +1,40 @@
+import unittest
+from unittest.mock import patch
+
+from worker.consumer.db import PgPool
+
+
+class TestPgPoolSizing(unittest.TestCase):
+    @patch("worker.consumer.db.ConnectionPool")
+    def test_honours_configured_max_size(self, mock_pool):
+        PgPool(dsn="postgresql://x/y", max_size=7)
+        _, kwargs = mock_pool.call_args
+        self.assertEqual(kwargs["max_size"], 7)
+        # min_size defaults to 4 (the psycopg_pool default) and is clamped down to max_size.
+        self.assertEqual(kwargs["min_size"], 4)
+
+    @patch("worker.consumer.db.ConnectionPool")
+    def test_defaults_to_ten(self, mock_pool):
+        PgPool(dsn="postgresql://x/y")
+        _, kwargs = mock_pool.call_args
+        self.assertEqual(kwargs["max_size"], 10)
+
+    @patch("worker.consumer.db.ConnectionPool")
+    def test_clamps_to_at_least_one(self, mock_pool):
+        PgPool(dsn="postgresql://x/y", max_size=0)
+        _, kwargs = mock_pool.call_args
+        self.assertEqual(kwargs["max_size"], 1)
+
+    @patch("worker.consumer.db.ConnectionPool")
+    def test_min_size_clamped_under_small_max(self, mock_pool):
+        # Small pool (e.g. MAX_CONCURRENT_JOBS=1 -> size 3) must not have
+        # min_size(4) > max_size, which psycopg rejects.
+        PgPool(dsn="postgresql://x/y", max_size=3)
+        _, kwargs = mock_pool.call_args
+        self.assertEqual(kwargs["max_size"], 3)
+        self.assertEqual(kwargs["min_size"], 3)
+        self.assertLessEqual(kwargs["min_size"], kwargs["max_size"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_image_pipeline.py b/worker/tests/test_image_pipeline.py
index 074c6b7..b74f08a 100644
--- a/worker/tests/test_image_pipeline.py
+++ b/worker/tests/test_image_pipeline.py
@@ -66,6 +66,7 @@ def test_process_image_file(self, mock_image_open):
         # ---------------------
         process_image_file(
             asset_id="test-123",
+            owner_id="tenant-1",
             local_raw_path="dummy.jpg",
             content_hash="deadbeef",
             pg_pool=mock_pg_pool,
@@ -86,7 +87,7 @@ def test_process_image_file(self, mock_image_open):
         # 3. Storage upload was called for all 3 variants
         self.assertEqual(len(storage.calls), 3)
         for key, size, content_type in storage.calls:
-            self.assertIn("media/processed/test-123/", key)
+            self.assertIn("media/tenant-1/processed/test-123/", key)
             self.assertTrue(size > 0)
             self.assertIn("image/", content_type)
 
diff --git a/worker/tests/test_logging_correlation.py b/worker/tests/test_logging_correlation.py
new file mode 100644
index 0000000..6962fcc
--- /dev/null
+++ b/worker/tests/test_logging_correlation.py
@@ -0,0 +1,38 @@
+import logging
+import unittest
+
+from opentelemetry.sdk.trace import TracerProvider
+
+from worker.utils.logger import TraceContextFilter
+
+
+class TestTraceContextFilter(unittest.TestCase):
+    """Phase 2b: log records carry the active span's trace_id/span_id."""
+
+    def _record(self):
+        return logging.LogRecord(
+            name="t", level=logging.INFO, pathname=__file__, lineno=1,
+            msg="hello", args=(), exc_info=None,
+        )
+
+    def test_no_span_emits_empty(self):
+        f = TraceContextFilter()
+        rec = self._record()
+        f.filter(rec)
+        self.assertEqual(rec.trace_id, "")
+        self.assertEqual(rec.span_id, "")
+
+    def test_active_span_stamps_ids(self):
+        provider = TracerProvider()
+        tracer = provider.get_tracer("test")
+        f = TraceContextFilter()
+        with tracer.start_as_current_span("s"):
+            rec = self._record()
+            f.filter(rec)
+            self.assertEqual(len(rec.trace_id), 32)
+            self.assertEqual(len(rec.span_id), 16)
+            self.assertRegex(rec.trace_id, r"^[0-9a-f]{32}$")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_migrations.py b/worker/tests/test_migrations.py
new file mode 100644
index 0000000..3f0cb34
--- /dev/null
+++ b/worker/tests/test_migrations.py
@@ -0,0 +1,89 @@
+"""
+Tests for the Python migration runner.
+
+These tests use mocks for ``psycopg.connect`` so they can run without a live
+PostgreSQL instance. The destructive-migration gate must short-circuit before
+opening any database connection, so a refusal test can use an obviously
+invalid DSN.
+"""
+
+from unittest import mock
+
+import pytest
+
+from worker.consumer.migrations import run_migrations
+
+
+def _write_migration(tmp_path, name: str, body: str = "-- stub") -> None:
+    (tmp_path / name).write_text(body)
+
+
+def test_run_migrations_refuses_destructive_when_disabled(tmp_path):
+    """Versions 7 and 8 must be refused unless allow_destructive=True.
+
+    The check must run against the file-system pending list before any
+    database connection is opened, so an obviously invalid DSN is fine and
+    psycopg.connect must never be invoked.
+    """
+    _write_migration(tmp_path, "000001_init.up.sql")
+    _write_migration(tmp_path, "000006_api_keys.up.sql")
+    _write_migration(tmp_path, "000007_split_webhook_key.up.sql")
+    _write_migration(tmp_path, "000008_assets_owner_not_null.up.sql")
+
+    with mock.patch("worker.consumer.migrations.psycopg.connect") as mock_connect:
+        with pytest.raises(RuntimeError, match="destructive migrations"):
+            run_migrations(
+                dsn="postgresql://invalid:invalid@127.0.0.1:1/invalid",
+                migrations_dir=str(tmp_path),
+                allow_destructive=False,
+            )
+
+    mock_connect.assert_not_called()
+
+
+def test_run_migrations_refuses_when_only_destructive_is_pending(tmp_path):
+    """A single pending destructive version is enough to abort."""
+    _write_migration(tmp_path, "000007_split_webhook_key.up.sql")
+
+    with mock.patch("worker.consumer.migrations.psycopg.connect") as mock_connect:
+        with pytest.raises(RuntimeError, match=r"\['000007'\]"):
+            run_migrations(
+                dsn="postgresql://invalid",
+                migrations_dir=str(tmp_path),
+                allow_destructive=False,
+            )
+
+    mock_connect.assert_not_called()
+
+
+def test_run_migrations_allows_destructive_with_flag(tmp_path, monkeypatch):
+    """With allow_destructive=True, the runner proceeds past the gate.
+
+    A fully mocked connection is used so the test does not need a real
+    PostgreSQL server. The runner only needs to make it through the gate;
+    per-migration application is covered by the integration smoke test.
+    """
+    monkeypatch.delenv("MIGRATION_ALLOW_DESTRUCTIVE", raising=False)
+    _write_migration(tmp_path, "000007_split_webhook_key.up.sql")
+    _write_migration(tmp_path, "000008_assets_owner_not_null.up.sql")
+
+    # `with psycopg.connect(dsn) as conn:` resolves to:
+    #   ctx = psycopg.connect(dsn); conn = ctx.__enter__()
+    # so the runner operates on mock_connect.return_value.__enter__.return_value.
+    # Pre-mark every pending version as already applied so no SQL executes
+    # and the destructive gate is the only thing under test.
+    mock_connect = mock.MagicMock()
+    mock_conn = mock_connect.return_value.__enter__.return_value
+    mock_cursor = mock_conn.cursor.return_value.__enter__.return_value
+    mock_cursor.fetchone.return_value = (1,)
+
+    with mock.patch(
+        "worker.consumer.migrations.psycopg.connect", mock_connect
+    ):
+        run_migrations(
+            dsn="postgresql://stub",
+            migrations_dir=str(tmp_path),
+            allow_destructive=True,
+        )
+
+    mock_conn.cursor.assert_called()
diff --git a/worker/tests/test_pipeline_spans.py b/worker/tests/test_pipeline_spans.py
new file mode 100644
index 0000000..7aeb9f0
--- /dev/null
+++ b/worker/tests/test_pipeline_spans.py
@@ -0,0 +1,77 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry import trace
+
+import worker.processing.processor as processor
+
+
+class TestPipelineStageSpans(unittest.TestCase):
+    """Phase 2a: dispatch emits download + dedup (+ delegate) spans."""
+
+    def setUp(self):
+        self.exporter = InMemorySpanExporter()
+        provider = TracerProvider()
+        provider.add_span_processor(SimpleSpanProcessor(self.exporter))
+        # Point the module-level proxy tracer at our in-memory provider.
+        self._tracer = provider.get_tracer("test")
+        self._orig = processor.tracer
+        processor.tracer = self._tracer
+
+    def tearDown(self):
+        processor.tracer = self._orig
+
+    def _pg_pool_returning(self, asset_row):
+        cursor = MagicMock()
+        cursor.fetchone.return_value = asset_row
+        conn = MagicMock()
+        conn.cursor.return_value = cursor
+        pg = MagicMock()
+        pg.get_pg_conn.return_value.__enter__.return_value = conn
+        return pg
+
+    @patch("worker.processing.processor.get_extension_for_mime", return_value="jpg")
+    @patch("worker.processing.processor.compute_file_hash", return_value="")
+    @patch("worker.processing.processor.process_image_file")
+    @patch("worker.processing.processor.os.path.exists", return_value=False)
+    def test_image_dispatch_emits_stage_spans(
+        self, _exists, mock_img, _hash, _ext
+    ):
+        asset_row = ("a1", "image", "uploaded", "u", "image/jpeg", None, "tenant-1")
+        pg = self._pg_pool_returning(asset_row)
+        storage = MagicMock()
+        cfg = MagicMock()
+        cfg.temp_dir = "/tmp"
+
+        processor.process_asset_dispatch("a1", pg, storage, cfg)
+
+        names = {s.name for s in self.exporter.get_finished_spans()}
+        self.assertIn("process.dispatch", names)
+        self.assertIn("process.download", names)
+        mock_img.assert_called_once()
+
+    @patch("worker.processing.processor.get_extension_for_mime", return_value="jpg")
+    @patch("worker.processing.processor.compute_file_hash", return_value="abc123")
+    @patch("worker.processing.processor.check_for_duplicate")
+    @patch("worker.processing.processor.process_image_file")
+    @patch("worker.processing.processor.os.path.exists", return_value=False)
+    def test_dedup_span_emitted_when_hash_present(
+        self, _exists, mock_img, mock_dedup, _hash, _ext
+    ):
+        mock_dedup.return_value = processor.DedupResult.NO_DUPLICATE
+        asset_row = ("a1", "image", "uploaded", "u", "image/jpeg", None, "tenant-1")
+        pg = self._pg_pool_returning(asset_row)
+
+        processor.process_asset_dispatch("a1", pg, MagicMock(), MagicMock(temp_dir="/tmp"))
+
+        names = {s.name for s in self.exporter.get_finished_spans()}
+        self.assertIn("process.dedup_check", names)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_processor_dispatch.py b/worker/tests/test_processor_dispatch.py
index 76cec9a..899bd24 100644
--- a/worker/tests/test_processor_dispatch.py
+++ b/worker/tests/test_processor_dispatch.py
@@ -29,8 +29,8 @@ def test_processing_failure_leaves_asset_status_untouched(
     ):
         mock_process_image.side_effect = RuntimeError("boom")
 
-        # (asset_id, type, status, original_url, mime_type, content_hash)
-        asset_row = ("asset-1", "image", "uploaded", "gs://raw/asset-1", "image/jpeg", None)
+        # (asset_id, type, status, original_url, mime_type, content_hash, owner_id)
+        asset_row = ("asset-1", "image", "uploaded", "gs://raw/asset-1", "image/jpeg", None, "tenant-1")
         pg_pool, cursor = self._make_pg_pool(asset_row)
         storage = MagicMock()
         cfg = MagicMock()
diff --git a/worker/tests/test_s3_public_endpoint.py b/worker/tests/test_s3_public_endpoint.py
index 2908cde..853d1cf 100644
--- a/worker/tests/test_s3_public_endpoint.py
+++ b/worker/tests/test_s3_public_endpoint.py
@@ -22,17 +22,17 @@ def _make(self, endpoint_url, public_endpoint_url):
 
     def test_public_url_uses_public_endpoint(self):
         st = self._make("http://minio:9000", "http://localhost:9000")
-        url = st.public_url("media/processed/abc/thumbnail.webp")
+        url = st.public_url("media/tenant-abc/processed/abc/thumbnail.webp")
         self.assertEqual(
-            url, "http://localhost:9000/mpiper/media/processed/abc/thumbnail.webp"
+            url, "http://localhost:9000/mpiper/media/tenant-abc/processed/abc/thumbnail.webp"
         )
         # Internal I/O still targets the private endpoint.
         self.assertEqual(st.client.meta.endpoint_url, "http://minio:9000")
 
     def test_public_url_falls_back_to_internal_when_unset(self):
         st = self._make("http://minio:9000", None)
-        url = st.public_url("media/raw/xyz")
-        self.assertEqual(url, "http://minio:9000/mpiper/media/raw/xyz")
+        url = st.public_url("media/tenant-xyz/raw/xyz")
+        self.assertEqual(url, "http://minio:9000/mpiper/media/tenant-xyz/raw/xyz")
 
     def test_bucket_config_defaults_public_to_internal(self, ):
         import os
diff --git a/worker/utils/logger.py b/worker/utils/logger.py
index bbc642c..6b2122b 100644
--- a/worker/utils/logger.py
+++ b/worker/utils/logger.py
@@ -2,8 +2,33 @@
 import os
 from typing import Optional
 
+from opentelemetry import trace
 
-_DEFAULT_FORMAT = "%(asctime)s %(levelname)s [%(name)s] %(message)s"
+
+# trace_id=<hex> matches the Grafana Loki derived-field regex (trace_id=(\w+)),
+# which links each log line to its Tempo trace. span_id is included for context.
+_DEFAULT_FORMAT = (
+    "%(asctime)s %(levelname)s [%(name)s] "
+    "trace_id=%(trace_id)s span_id=%(span_id)s %(message)s"
+)
+
+
+class TraceContextFilter(logging.Filter):
+    """Inject the active span's trace_id/span_id into every log record.
+
+    Emits empty strings when the current span context is not valid, so the Grafana
+    derived field does not create a link to a non-existent trace.
+    """
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        ctx = trace.get_current_span().get_span_context()
+        if ctx is not None and ctx.is_valid:
+            record.trace_id = format(ctx.trace_id, "032x")
+            record.span_id = format(ctx.span_id, "016x")
+        else:
+            record.trace_id = ""
+            record.span_id = ""
+        return True
 
 
 def setup_logging(
@@ -35,6 +60,12 @@ def setup_logging(
         format=fmt,
     )
 
+    # Attach the trace-context filter at the handler level so it stamps every
+    # record flowing through, regardless of which logger emitted it.
+    trace_filter = TraceContextFilter()
+    for handler in logging.getLogger().handlers:
+        handler.addFilter(trace_filter)
+
     # Silence noisy libraries (optional, but recommended)
     logging.getLogger("urllib3").setLevel(logging.WARNING)
     logging.getLogger("botocore").setLevel(logging.WARNING)
diff --git a/worker/utils/metrics.py b/worker/utils/metrics.py
index 82af1be..75ce61d 100644
--- a/worker/utils/metrics.py
+++ b/worker/utils/metrics.py
@@ -18,6 +18,10 @@
 from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
 from opentelemetry.sdk.metrics import MeterProvider
 from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+from opentelemetry.sdk.metrics.view import (
+    ExplicitBucketHistogramAggregation,
+    View,
+)
 from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION, DEPLOYMENT_ENVIRONMENT, SERVICE_INSTANCE_ID
 
 from worker.utils.logger import get_logger
@@ -97,8 +101,27 @@ def init_metrics(
     # Create metric reader with 15-second export interval
     reader = PeriodicExportingMetricReader(exporter, export_interval_millis=15000)
 
+    # Finer histogram buckets for duration metrics. The SDK default buckets are
+    # too coarse for sub-second work (everything lands in [0,5) so p95 reads
+    # ~4.75s), which makes the image/job latency SLIs meaningless. These cover
+    # tens-of-ms (images) through tens-of-seconds (video transcode).
+    _duration_buckets = [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300]
+    duration_views = [
+        View(
+            instrument_name=name,
+            aggregation=ExplicitBucketHistogramAggregation(_duration_buckets),
+        )
+        for name in (
+            "mpiper.asset.processing.duration",
+            "mpiper.job.processing.duration",
+            "mpiper.queue.processing.duration",
+        )
+    ]
+
     # Create meter provider
-    provider = MeterProvider(resource=resource, metric_readers=[reader])
+    provider = MeterProvider(
+        resource=resource, metric_readers=[reader], views=duration_views
+    )
     metrics.set_meter_provider(provider)
 
     # Get meter
@@ -220,9 +243,43 @@ def init_metrics(
     logger.info("OpenTelemetry metrics initialized successfully")
 
 
+def record_consume() -> None:
+    """Add 1 to the consumed-message counter; no-op while the counter is None."""
+    if queue_message_consumed is not None:
+        queue_message_consumed.add(1)
+
+
+def record_job(success: bool, duration_seconds: float) -> None:
+    """Count one job by outcome and record its duration (None instruments skipped)."""
+    # The total is always bumped, followed by exactly one outcome counter.
+    outcome_counter = job_processing_success if success else job_processing_failed
+    for counter in (job_processing_total, outcome_counter):
+        if counter is not None:
+            counter.add(1)
+    # Duration is recorded for successful and failed jobs alike.
+    if job_processing_duration is not None:
+        job_processing_duration.record(duration_seconds)
+
+
+def record_asset(asset_type: str, duration_seconds: float, success: bool) -> None:
+    """Count one processed asset by outcome and record its duration.
+
+    The only label is asset_type (low-cardinality: image/video), with "unknown"
+    substituted for an empty value; asset_id must never be a label.
+    """
+    labels = {"asset_type": asset_type or "unknown"}
+    outcome_counter = asset_processing_success if success else asset_processing_failed
+    for counter in (asset_processing_total, outcome_counter):
+        if counter is not None:
+            counter.add(1, labels)
+    # Duration is recorded for successes and failures alike.
+    if asset_processing_duration is not None:
+        asset_processing_duration.record(duration_seconds, labels)
+
+
 def get_meter() -> Optional[metrics.Meter]:
     """Get the global meter instance.
-    
+
     Returns
     -------
     Optional[metrics.Meter]
@@ -231,6 +288,32 @@ def get_meter() -> Optional[metrics.Meter]:
     return _meter
 
 
+def register_dlq_depth_gauge(get_depth) -> None:
+    """Register an observable gauge reporting the dead-letter stream length.
+
+    `get_depth` is a zero-arg callable returning the current DLQ depth (e.g.
+    `lambda: redis.xlen(dlq_stream)`). When telemetry was not initialised (meter
+    is None) this is a no-op. A failing callback is logged and yields no
+    observation for that export, so a Redis blip never breaks metric export.
+    """
+    if _meter is None:
+        return
+
+    def _observe(_options):
+        try:
+            return [metrics.Observation(int(get_depth()))]
+        except Exception:  # pragma: no cover - defensive: never break export
+            logger.warning("DLQ depth callback failed", exc_info=True)
+            return []
+
+    _meter.create_observable_gauge(
+        name="mpiper.dlq.depth",
+        callbacks=[_observe],
+        description="Current number of messages in the dead-letter stream",
+        unit="{message}",
+    )
+
+
 def shutdown_metrics() -> None:
     """Shutdown the metrics provider and flush all pending metrics."""
     provider = metrics.get_meter_provider()
diff --git a/worker/utils/tracing.py b/worker/utils/tracing.py
new file mode 100644
index 0000000..c2cadf5
--- /dev/null
+++ b/worker/utils/tracing.py
@@ -0,0 +1,121 @@
+"""
+worker.utils.tracing
+
+OpenTelemetry tracing initialization for the Python worker.
+
+Mirrors `worker.utils.metrics`: an OTLP gRPC exporter to the same collector
+endpoint, a BatchSpanProcessor, and the SAME W3C propagators as the Go API
+(`traceparent` + `baggage`) so the trace continues across the Redis boundary
+instead of starting fresh.
+
+The worker had OTel metric instruments but no tracer and no context extraction,
+so the distributed trace died at the queue. This closes that gap on the consumer
+side; `worker.consumer.consumer` extracts the producer context and starts the
+consume span as a child (with a link) of it.
+"""
+
+from typing import Optional
+
+from opentelemetry import trace
+from opentelemetry.baggage.propagation import W3CBaggagePropagator
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.propagate import set_global_textmap
+from opentelemetry.propagators.composite import CompositePropagator
+from opentelemetry.sdk.resources import (
+    DEPLOYMENT_ENVIRONMENT,
+    SERVICE_INSTANCE_ID,
+    SERVICE_NAME,
+    SERVICE_VERSION,
+    Resource,
+)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased, TraceIdRatioBased
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+
+from worker.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Global tracer for the worker
+_tracer: Optional[trace.Tracer] = None
+_provider: Optional[TracerProvider] = None
+
+
+def _build_sampler(deployment_env: str, sampling_rate: float):
+    """Choose the span sampler for a deployment environment.
+
+    Dev-like names (or "") keep every span; anything else gets parent-based
+    ratio sampling. Intended to match the Go API's getSampler().
+    """
+    is_dev = deployment_env in ("development", "dev", "local", "")
+    return ALWAYS_ON if is_dev else ParentBased(root=TraceIdRatioBased(sampling_rate))
+
+
+def init_tracing(
+    service_name: str = "mpiper-worker",
+    service_version: str = "dev",
+    endpoint: str = "otel-collector:4317",
+    deployment_env: str = "development",
+    instance_id: Optional[str] = None,
+    tls_insecure: bool = True,
+    sampling_rate: float = 1.0,
+) -> None:
+    """Set up the worker's tracer provider, OTLP gRPC exporter and propagators.
+
+    Arguments are expected to come from the centralised config
+    (get_config().otel). Once a tracer exists, further calls only log a
+    warning and return, so startup code may call this next to init_metrics.
+    """
+    global _tracer, _provider
+
+    # Already initialised: leave the existing provider untouched.
+    if _tracer is not None:
+        logger.warning("Tracing already initialized")
+        return
+
+    # Drop any URL scheme; only the part after "://" reaches the exporter.
+    if "://" in endpoint:
+        endpoint = endpoint.partition("://")[2]
+
+    logger.info(f"Initializing OpenTelemetry tracer with endpoint: {endpoint}")
+
+    resource_attributes = {
+        SERVICE_NAME: service_name,
+        SERVICE_VERSION: service_version,
+        DEPLOYMENT_ENVIRONMENT: deployment_env,
+        # Without an explicit instance id the service name stands in for it.
+        SERVICE_INSTANCE_ID: instance_id or service_name,
+    }
+    worker_resource = Resource.create(resource_attributes)
+
+    span_exporter = OTLPSpanExporter(endpoint=endpoint, insecure=tls_insecure)
+
+    tracer_provider = TracerProvider(
+        resource=worker_resource,
+        sampler=_build_sampler(deployment_env, sampling_rate),
+    )
+    tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
+    trace.set_tracer_provider(tracer_provider)
+
+    # W3C trace-context plus baggage, the same composite the Go API uses, so a
+    # traceparent injected by the producer can be extracted on this side.
+    w3c_propagators = [TraceContextTextMapPropagator(), W3CBaggagePropagator()]
+    set_global_textmap(CompositePropagator(w3c_propagators))
+
+    # Publish the module-level handles only after setup has succeeded.
+    _provider = tracer_provider
+    _tracer = trace.get_tracer(__name__)
+    logger.info("OpenTelemetry tracer initialized successfully")
+
+
+def get_tracer() -> Optional[trace.Tracer]:
+    """Return the module-level tracer; it is None until init_tracing sets it."""
+    return _tracer
+
+
+def shutdown_tracing() -> None:
+    """Shut down the tracer provider, if any; the module globals stay set."""
+    if _provider is not None:
+        _provider.shutdown()
+        logger.info("Tracer provider shutdown complete")