diff --git a/cmd/server/main.go b/cmd/server/main.go
index bb4f886..9a2a2a2 100644
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"database/sql"
 	"errors"
 	"fmt"
 	"net/http"
@@ -125,6 +126,13 @@ func main() {
 	_ = m.RegisterOutboxPendingFunc(func(ctx context.Context) (int64, error) {
 		return outboxRepo.CountPending(ctx)
 	})
+
+	// Observe the database connection-pool stats (in-use / idle / open / max /
+	// wait count). sqlx.DB embeds *sql.DB, so db.Stats() exposes pool saturation
+	// — the key signal for whether the DB pool is a bottleneck under load.
+	_ = m.RegisterDBStatsFunc(func() sql.DBStats {
+		return db.Stats()
+	})
 	go relay.Start(serverCtx)
 	go relay.StartCleanup(serverCtx, cfg.Outbox.Retention)
 
diff --git a/docker-compose.loadtest.yml b/docker-compose.loadtest.yml
new file mode 100644
index 0000000..55fadc9
--- /dev/null
+++ b/docker-compose.loadtest.yml
@@ -0,0 +1,54 @@
+# ============================================================================
+# MPiper — Load-test overlay (Track 3, Phase 0)
+#
+#   docker compose \
+#     -f docker-compose.yml \
+#     -f docker-compose.observability.yml \
+#     -f docker-compose.loadtest.yml \
+#     up -d --build
+#
+# Why this exists
+# ---------------
+# Local load-test results are only interpretable when resources are PINNED, so
+# "the worker is the bottleneck" is a stable, observable fact rather than
+# something that moves run to run depending on spare laptop cores. This overlay:
+#
+#   1. Caps CPU/memory on `api` and `worker` via `deploy.resources.limits`
+#      (honoured by `docker compose up` in Compose v2 — no swarm needed).
+#   2. Forces TRACE_SAMPLING_RATE=1.0 so every asset produces a full trace.
+#
+# The worker is intentionally pinned to a single CPU: it is single-threaded
+# today (Track 1), and pinning makes that saturation visible and reproducible.
+# Document the limits alongside every experiment — changing them changes ffmpeg
+# thread behaviour, which is the point, but it must be recorded for comparison.
+# ============================================================================
+
+name: mpiper
+
+services:
+  api:
+    environment:
+      # Full sampling locally — never lose a trace to the 0.1 code default.
+      TRACE_SAMPLING_RATE: "1.0"
+    deploy:
+      resources:
+        limits:
+          cpus: "1.0"
+          memory: 512M
+        reservations:
+          cpus: "0.25"
+          memory: 128M
+
+  worker:
+    environment:
+      # Full sampling locally; matches the API so the whole trace is captured.
+      TRACE_SAMPLING_RATE: "1.0"
+    deploy:
+      resources:
+        limits:
+          # Single CPU on purpose — surfaces the single-threaded bottleneck.
+          cpus: "1.0"
+          memory: 1024M
+        reservations:
+          cpus: "0.5"
+          memory: 256M
diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml
index 65d4760..7ff8300 100644
--- a/docker-compose.observability.yml
+++ b/docker-compose.observability.yml
@@ -21,7 +21,7 @@ services:
   # Grafana Tempo - Distributed Tracing Backend
   # ==========================================================================
   tempo:
-    image: grafana/tempo:latest
+    image: grafana/tempo:2.6.1
     container_name: mpiper-tempo
     command: ["-config.file=/etc/tempo.yaml"]
     volumes:
@@ -102,8 +102,14 @@ services:
       - '--web.console.libraries=/usr/share/prometheus/console_libraries'
       - '--web.console.templates=/usr/share/prometheus/consoles'
       - '--web.enable-lifecycle'
+      # Accept k6's Prometheus remote-write output (Track 3, Phase 4) so client-
+      # side load metrics land in the same Prometheus as the server-side ones.
+      - '--web.enable-remote-write-receiver'
+      # Exemplar storage powers the histogram-bucket -> Tempo trace links.
+      - '--enable-feature=exemplar-storage'
     volumes:
       - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./observability/prometheus.rules.yml:/etc/prometheus/prometheus.rules.yml
       - prometheus-data:/prometheus
     ports:
       - "9090:9090"
diff --git a/docs/enhancements/README.md b/docs/enhancements/README.md
new file mode 100644
index 0000000..9e79164
--- /dev/null
+++ b/docs/enhancements/README.md
@@ -0,0 +1,204 @@
+# MPiper Enhancements — Roadmap
+
+This directory tracks the work that takes MPiper from a well-built side project
+to a production-grade media platform. Each **track** is chosen to teach a
+distinct, transferable systems-engineering concept *and* to add real product
+value — not feature-padding.
+
+The philosophy: **write a design doc per track before coding** (problem, options,
+decision, tradeoffs, how success is measured), and pair every track with a
+load test or chaos experiment so each claim ("now it scales", "now it's
+exactly-once") is *demonstrated*, not assumed.
+
+> **Progress:** Track 3 (observability + load testing) is **done** ✅. It shipped
+> the foundation that makes everything below *measurable* — end-to-end tracing,
+> SLOs, Grafana dashboards, and a k6 load harness. The first load test
+> ([`experiments/0001-worker-saturation.md`](../../experiments/0001-worker-saturation.md))
+> already re-ordered this roadmap with hard data instead of hunches.
+
+## Where we are today
+
+A clean, correct, **single-tenant, best-effort, single-node-throughput** pipeline
+with good bones — now fully observable:
+
+- Transactional enqueue via an **outbox relay** (Postgres → Redis Streams).
+- An **idempotent-ish consumer** with content-hash dedup.
+- **Presigned uploads** with a split internal/public storage endpoint.
+- **Webhooks** with HMAC signing + exponential backoff.
+- **End-to-end distributed tracing** (API → outbox → Redis → worker → ffmpeg, one
+  waterfall per asset), **OTel metrics** on API and worker, **SLO recording rules**,
+  provisioned **Grafana dashboards**, and a host-run **k6 load harness** — on the
+  bundled Grafana/Tempo/Loki/Prometheus stack.
+
+Known seams where "side project" becomes "system" (verified in code, and now
+several of them **measured** under load):
+
+- **Single-threaded worker** — `MAX_CONCURRENT_JOBS` exists in config but is never
+  used; `consume()` pulls one message (`count=1`) and processes it inline.
+  *Measured:* μ ≈ **1.1 jobs/s**, worker CPU **98%**, queue depth → **2,544**. The
+  hard throughput ceiling and the confirmed **#1 bottleneck**.
+- **Webhook dispatcher can't keep up** — a 2s poll × batch-50 loop delivering
+  webhooks with synchronous HTTP + retries. *Measured:* `webhook_pending` peaked at
+  **~5,900** and never drained. A second, independent bottleneck.
+- **Homegrown recovery** — a 2-min DB-scan + re-`XADD`, not Redis Streams'
+  `XPENDING`/`XAUTOCLAIM` consumer-group recovery; poison messages are marked
+  `failed` and dropped (no dead-letter stream).
+- **No raw-upload lifecycle** — objects in `media/raw/` are never deleted.
+  *Measured:* ~**50%** of presigned uploads are never completed → orphaned objects
+  accumulate.
+- **Homegrown auth** — an AES-GCM token with no expiry/rotation, and the same
+  `ENCRYPTION_KEY` signs both auth tokens and webhook secrets.
+- **Polled high-churn tables** (`jobs`, `event_outbox`, `webhook_deliveries`), grown
+  unbounded with cleanup-by-retention only. *Measured:* `event_outbox` kept up with
+  **0 backlog** and the DB had headroom (**18 ms** mean query, **5/25** connections);
+  only `webhook_deliveries` actually strained.
+
+## What the first load test proved (exp 0001)
+
+Track 3 gave us the instrumentation to stop guessing. The first saturating run
+(`open --rate 10/s`, CPU-pinned worker) turned the seams above into a **measured,
+ranked** list — and every track below now has a baseline to beat by re-running the
+*same* k6 profile and comparing the dashboards.
+
+| Finding (measured) | What it means | Owner |
+|---|---|---|
+| Worker μ ≈ **1.1 jobs/s**, CPU 98%, queue → **2,544** | Single-threaded worker is the throughput ceiling | **Track 1 (P0)** |
+| `webhook_pending` → **5,901**, never drains | Dispatcher delivery rate ≪ insertion rate | **Track 1b (P1, new)** |
+| `event_outbox` **0 backlog**; DB **18 ms** mean, **5/25** conns | Outbox + DB have large headroom *today* | Track 7 → **defer** |
+| `webhook_deliveries` is the one polled table straining | The real, current trigger for data-layer work | Track 7 → **rescope to this** |
+| **~50%** of presigns never completed → orphaned `media/raw/` | Storage grows with abandoned uploads | Track 5 (small) |
+| `/complete` p99 **358 ms** (synchronous MinIO HEAD) | Minor hot-path tail | Track 5 (small) |
+
+Net effect: **Track 1 is confirmed P0**, a **webhook-throughput bottleneck was
+surfaced that no track owned** (now Track 1b), and **Track 7's table-partitioning is
+premature** — the DB isn't the problem yet; the webhook *delivery loop* is.
+
+## Tracks
+
+| # | Track | Core systems lesson | Status |
+|---|-------|---------------------|--------|
+| 1 | [Concurrent worker + proper stream recovery + DLQ](track-01-concurrent-worker.md) | Concurrency models, at-least-once recovery, poison-message handling, head-of-line blocking | **next — P0 (data-confirmed)** |
+| 1b | Webhook delivery throughput *(surfaced by exp 0001)* | Concurrent I/O-bound delivery, backpressure on a side-channel, decoupling fan-out from job completion | **new — P1** |
+| 2 | [Queue-depth autoscaling](track-02-autoscaling.md) | Backpressure, control loops, Little's Law, SLO-driven capacity | planned (after T1) |
+| 3 | [End-to-end tracing, SLOs & local load testing](track-03-observability-and-load.md) | Context propagation across async boundaries, the three pillars, SLO/SLI/error budgets, load-test methodology | **done ✅** |
+| 4 | [Multi-tenancy, auth & quotas](track-04-multitenancy-auth.md) | AuthN vs AuthZ, key rotation, the idempotency pattern, tenant isolation | planned |
+| 5 | [Production ingestion pipeline](track-05-ingestion.md) | Resumable/multipart uploads, pipeline stages, defense-in-depth, trust boundaries | planned |
+| 6 | [Adaptive streaming + CDN](track-06-adaptive-streaming.md) | ABR streaming, CDN cache/invalidation, edge auth, encoding cost/quality tradeoffs | planned |
+| 7 | [Data layer at scale](track-07-data-layer.md) | Table partitioning, CDC vs polling, index design under write load | **deferred — rescope to `webhook_deliveries`** |
+| 8 | [Resilience & correctness verification](track-08-resilience.md) | Failure-mode analysis, exactly-once in practice, replay attacks, chaos engineering | planned |
+
+> Track 3 is the only track with a full design doc checked in, because it was built
+> first. Now that it's done, every track below is **measurable**: implement, re-run
+> the same k6 profile, compare dashboards, and record an `experiments/NNNN-*.md`
+> writeup. "It scales" is a claim we can prove, not assert.
+
+## Recommended sequence (re-prioritized from exp 0001 data)
+
+1. **Track 1 — concurrent worker + DLQ + stream recovery.** **P0, data-confirmed.**
+   The single-threaded worker is the throughput ceiling (μ ≈ 1.1 jobs/s, queue → 2,544).
+   Biggest lever; self-contained. *Verify:* re-run `open --rate 10/s` — expect μ to
+   scale with the pool and the queue to stabilize → `experiments/0002`.
+2. **Track 1b — webhook delivery throughput.** **P1, newly surfaced.** Independent of
+   the worker: `webhook_pending` hit ~5,900 and never drained. Concurrent/batched
+   delivery + wire the unrecorded `webhook_delivery_*` metrics. Small, high-value.
+   *Verify:* `webhook_pending` drains under the same load.
+3. **Track 2 — autoscaling.** Needs a concurrent worker first; then scale it on the
+   queue-lag signal we already expose. Now directly measurable.
+4. **Track 4 — multi-tenancy + idempotency + auth.** The leap to "real users".
+5. **Track 6 — adaptive streaming + CDN.** The headline product feature.
+6. **Track 5 — ingestion.** Includes the small wins exp 0001 surfaced: abandoned-upload
+   lifecycle (~50% orphaned `media/raw/`) and the `/complete` MinIO-HEAD tail.
+7. **Track 7 — data layer.** **Deferred and rescoped.** DB/outbox have headroom today;
+   revisit when volume justifies, scoped first to `webhook_deliveries` churn (the one
+   polled table that actually strained) rather than blanket partitioning.
+8. **Track 8 — resilience & correctness.** Depth once the throughput tracks land.
+
+> **Track 3 follow-ups (do before the next experiment so p95s aren't distorted):**
+> wire the `webhook_delivery_*` and `storage_operation_*` metrics, add a fine-bucket
+> view to `db.query.duration`, and standardize histogram buckets across services.
+
+---
+
+## Track catalog (summaries)
+
+### Track 1 — Concurrent worker + proper stream recovery + DLQ
+**Gap:** one job at a time; a 3s video blocks a 200ms thumbnail. Recovery scans
+Postgres and re-`XADD`s instead of using consumer-group delivery state.
+**Move:** bounded worker pool (process pool for CPU-bound ffmpeg/Pillow vs async
+for I/O — *choosing which is the lesson*); honour `MAX_CONCURRENT_JOBS` as a
+semaphore; `XAUTOCLAIM`/`XPENDING` to reclaim dead-consumer messages; a
+**dead-letter stream** for messages past the attempt cap; priority lanes so small
+jobs don't queue behind large transcodes.
+**Teaches:** thread vs process vs async, the GIL, CPU vs I/O bound, at-least-once
+recovery, poison-message handling, head-of-line blocking.
+
+### Track 1b — Webhook delivery throughput *(surfaced by exp 0001)*
+**Gap:** the dispatcher polls every 2s, batch 50, and delivers webhooks with
+*synchronous* HTTP + retries on a single loop. Each job emits 3 events
+(`job.starting/started/done`), so insertion rate ≫ delivery rate — the load test
+drove `webhook_pending` to ~5,900 with no recovery. Delivery is also under-
+instrumented: `webhook_delivery_total/duration/failures` are defined but never
+recorded, so only the `pending` gauge revealed the backlog.
+**Move:** a bounded pool of concurrent delivery workers (I/O-bound → async/threads
+fits); decouple fan-out from job completion; wire the delivery metrics + a
+delivery-latency SLI. Optionally move webhook rows onto their own stream consumer
+rather than a DB poll.
+**Teaches:** concurrency for I/O-bound work, backpressure on a side-channel,
+decoupling producers from slow consumers, instrumenting before optimizing.
+
+### Track 2 — Queue-depth autoscaling
+**Gap:** static worker count; bursts grow latency unbounded, idle wastes capacity.
+**Move:** expose stream lag + oldest-message-age (extend the existing relay-lag
+metric); drive **KEDA** (k8s manifests already exist) to scale workers on lag;
+load-test the backlog → scale → drain cycle.
+**Teaches:** backpressure, control loops, latency- vs queue-depth-based scaling,
+Little's Law (L = λW), capacity planning.
+
+### Track 4 — Multi-tenancy, auth & quotas
+**Gap:** homegrown AES token (no expiry/rotation, shared key with webhook secrets);
+single bucket, path-prefixed; no idempotency keys (retried presign = duplicate asset).
+**Move:** OIDC/JWT (asymmetric keys, expiry, JWKS rotation) or scoped API keys;
+separate webhook-signing secret; org→project→asset model with repository-layer
+row scoping and per-tenant storage prefixes/credentials; **idempotency keys** on
+`presign`/`complete`; per-tenant **quotas + rate limits** with usage accounting.
+**Teaches:** authN vs authZ, key management/rotation, the idempotency pattern,
+tenant isolation, security blast-radius.
+
+### Track 5 — Production ingestion pipeline
+**Gap:** single presigned `PUT`, 500MB cap, no resumability, MIME-only validation,
+no scanning. Plus (from exp 0001) **no lifecycle for abandoned uploads** — ~50% of
+presigns never complete, orphaning `media/raw/` objects.
+**Move:** S3 **multipart/resumable** uploads with part-level retry; a validation
+stage with real content sniffing (`python-magic` is already a dep); optional
+**ClamAV** malware scan as a stage; dedup *before* full download via verified
+client-supplied hash; a TTL/lifecycle sweep for un-completed raw uploads.
+**Teaches:** large-file transfer, pipeline/stage design, defense-in-depth, trust
+boundaries (never trust client content-type).
+
+### Track 6 — Adaptive streaming + CDN
+**Gap:** one fixed 720p MP4 at hardcoded 2500kbps, served straight from MinIO.
+**Move:** generate an **HLS/DASH adaptive ladder** (multiple renditions + manifest —
+`variants.video.manifest_url` already exists in the schema); serve via **CDN** with
+signed URLs + cache-control; content-aware/per-title encoding decisions.
+**Teaches:** adaptive bitrate streaming, CDN cache strategy + invalidation, edge
+signed-URL access control, encoding cost/quality tradeoffs.
+
+### Track 7 — Data layer at scale *(deferred — see exp 0001)*
+**Gap:** `jobs`, `event_outbox`, `webhook_deliveries` polled and growing. The load
+test showed the DB and outbox have **headroom today** (18 ms mean query, 5/25
+connections, 0 outbox backlog), so blanket partitioning is premature — but
+`webhook_deliveries` is the one table that genuinely strained.
+**Move:** start narrow — partition/clean `webhook_deliveries` and replace its poll
+with `LISTEN/NOTIFY` or a stream consumer (overlaps Track 1b). Broaden to the other
+tables (monthly partitions; drop instead of DELETE; read replicas; CDC) only when
+volume justifies it.
+**Teaches:** partitioning, CDC vs polling, write-heavy index design, pool sizing.
+
+### Track 8 — Resilience & correctness verification
+**Gap:** unit + integration tests exist, but no proof of survival under failure/load.
+**Move:** **fault injection / chaos** (kill the worker mid-transcode, pause Redis,
+fill the disk — verify processed-once holds); **load tests** with latency budgets in
+CI; **webhook contract tests** + replay protection (sign a timestamp, reject stale
+deliveries — today a captured payload replays forever).
+**Teaches:** failure-mode analysis, exactly-once vs at-least-once in practice,
+replay attacks, reliability as a tested property.
diff --git a/docs/enhancements/track-03-handoff.md b/docs/enhancements/track-03-handoff.md
new file mode 100644
index 0000000..089c030
--- /dev/null
+++ b/docs/enhancements/track-03-handoff.md
@@ -0,0 +1,222 @@
+# Track 3 — Session Handoff (start here)
+
+**Purpose:** everything a fresh conversation needs to begin **Track 3
+(end-to-end tracing, SLOs & local load testing)** without prior context. Read
+this top to bottom, then open `track-03-observability-and-load.md` for the full
+design and phased plan. This doc is the *operational* companion: where things
+are, how to run them, and the landmines already discovered.
+
+---
+
+## 1. What MPiper is (60-second orientation)
+
+A media-processing pipeline: a **Go API** (`cmd/server`, `internal/`) accepts
+uploads and a **Python worker** (`worker/`) processes them. They communicate over
+**Redis Streams** (`media:jobs`). **Postgres** is the durable source of truth;
+**MinIO** (S3-compatible) stores objects. Webhooks notify clients of job
+lifecycle events.
+
+**Asset flow:**
+`POST /api/v1/storage/presign` → client `PUT`s file to MinIO →
+`GET /api/v1/assets/{id}/complete` (writes asset `uploaded` + job + outbox row +
+`job.starting` webhook rows in one tx) → **outbox relay** (1s poll) publishes to
+Redis → **worker** consumes → image (3 webp variants) or video (poster + 720p +
+preview) → variants written to MinIO + Postgres, asset `ready` → worker inserts
+`job.started`/`job.done` webhook rows → **dispatcher** (2s poll) delivers signed POSTs.
+
+---
+
+## 2. The Track 3 goal in one sentence
+
+Make one **trace per asset** that spans API → Redis → worker → ffmpeg (so queue
+wait and per-stage time are visible), define a small set of **SLOs**, and build a
+**local k6 load harness** + Grafana dashboards so we can saturate the system on a
+laptop and *see* where it bends. Full plan: `track-03-observability-and-load.md`.
+
+---
+
+## 3. Current telemetry state (verified in code)
+
+- **Go API:** OTel **traces + metrics**, exported OTLP to `otel-collector:4317`.
+  Tracer init in `internal/metrics/otel.go`; metric instruments in
+  `internal/metrics/metrics.go`.
+- **Python worker:** OTel **metrics only** (`worker/utils/metrics.py`, OTLP to
+  `otel-collector:4317`). **No tracer, no span creation, no context extraction.**
+- **The gap:** the Go side traces the HTTP request and `Enqueue`, but **never
+  injects a `traceparent`** into the Redis message or the outbox row. The worker
+  therefore starts fresh with no parent. The trace dies at the queue boundary.
+- **Observability stack** (`docker-compose.observability.yml`, configs in
+  `observability/`): OTel Collector (bridges `mpiper_net` ↔ `mpiper_obs_net`),
+  **Tempo** (traces), **Prometheus** (metrics), **Loki + Promtail** (logs),
+  **Grafana** (dashboards, anonymous admin). Collector pipeline: OTLP receiver →
+  Tempo (traces) + Prometheus exporter `:8889` (metrics).
+
+> Note: `CLAUDE.md` historically said the worker is "prometheus_client (not OTel)"
+> — that's **stale/wrong**; the worker uses OTel metrics. Don't trust that line.
+
+---
+
+## 4. Exact engineering targets for Phase 1 (close the trace gap)
+
+These are the precise seams to touch. Verify each before editing.
+
+**Inject context (Go):**
+- `internal/queue/queue.go` — `RedisQueue.Enqueue` builds the stream message
+  (a `map`); the worker reads its fields. Inject `traceparent` (and `tracestate`)
+  here using the OTel propagator, as top-level message field(s).
+- `internal/outbox/relay.go` — `tick()` unmarshals the outbox row payload and calls
+  `queue.Enqueue(ctx, payload)`. Because enqueue is **store-and-forward**, the
+  trace context must survive in the **outbox row** too: capture it when the row is
+  written in `internal/service/asset.go` (`MarkAssetUploaded`, the
+  `outboxRepo.InsertTx` call), persist it (extend `internal/models/outbox.go` +
+  `internal/repository/outbox_repo.go` + a migration), and re-inject on relay.
+- **Verify the global propagator is set** in `internal/metrics/otel.go`
+  (`otel.SetTextMapPropagator(propagation.TraceContext{})`). If missing, add it —
+  without it, injection is a no-op.
+
+**Extract + continue (Python):**
+- Add `worker/utils/tracing.py` mirroring `worker/utils/metrics.py` (tracer init,
+  OTLP exporter to the same endpoint). Find where `init_metrics(...)` is called and
+  init the tracer alongside it (same lifecycle).
+- `worker/consumer/consumer.py` — in `consume()`, after the message payload is
+  normalized (note: a `body` field, if present, is JSON-decoded and merged), read
+  `traceparent` and start the consumer span. Use a **child span with a link** to
+  the producer context (link is the correct primitive for queue fan-in; child span
+  keeps the Tempo waterfall readable).
+
+**Span the stages (Phase 2):**
+- `worker/processing/processor.py` — `process_asset_dispatch` (download, dedup check).
+- `worker/processing/images.py` — per-variant encode/upload.
+- `worker/processing/videos.py` — `run()` wraps each ffmpeg call (poster / transcode_720p / preview).
+- Stamp `trace_id`/`span_id` into worker + API structured logs for Loki↔Tempo linking.
+
+**Message format reminder:** the consumer accepts either `job_id` (canonical) or
+`asset_id`. The outbox payload (built in `asset.go`) currently carries `job_id`,
+`asset_id`, `event`, `timestamp`. Add trace context as additional field(s); don't
+break the existing keys.
+
+---
+
+## 5. Environment & topology facts (host = macOS)
+
+**Host ports → containers:**
+| Service | Host | Container | Notes |
+|---|---|---|---|
+| API | 5010 | 5010 | `/healthz`, `/api/v1/...` |
+| Postgres | 5433 | 5432 | user `mpiper`, db `mpiper`, pw `changeme` |
+| Redis | 6380 | 6379 | stream `media:jobs`, group `worker-group` |
+| MinIO API | 9000 | 9000 | bucket `mpiper` (anon download on) |
+| MinIO console | 9001 | 9001 | minioadmin / minioadmin |
+| Grafana | 3000 | 3000 | anon admin |
+| Prometheus | 9090 | 9090 | |
+| Tempo | 3200 | 3200 | OTLP in on 4317/4318 (obs net) |
+| webhook-receiver | 8888 | 8080 | overlay only |
+
+**Container names:** `mpiper-api`, `mpiper-worker`, `mpiper-postgres`,
+`mpiper-redis`, `mpiper-minio`, `mpiper-webhook-receiver`, `mpiper-otel-collector`,
+`mpiper-tempo`, `mpiper-prometheus`, `mpiper-grafana`, `mpiper-loki`.
+
+**Storage split endpoint (implemented):** `S3_ENDPOINT_URL=http://minio:9000`
+(internal I/O) vs `S3_PUBLIC_ENDPOINT_URL=http://localhost:9000` (presigned +
+public URLs). Don't undo this — host-run load tests depend on it.
+
+**Telemetry env (`.env.local`):** `OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317`,
+`OTEL_TLS_INSECURE=true`, `TRACE_SAMPLING_RATE` (default 0.1 in code — **set to
+1.0 locally** so every asset traces). `ENCRYPTION_KEY=0123456789abcdef0123456789abcdef`
+(32 bytes; used for auth tokens AND webhook secrets).
+
+---
+
+## 6. Runbook / command cheat sheet
+
+```bash
+# Bring up core + observability (+ webhooks if you want webhook traces too)
+docker compose -f docker-compose.yml -f docker-compose.observability.yml up -d --build
+# add: -f docker-compose.webhooks.yml   (for webhook receiver)
+
+# End-to-end smoke (host-run; image + video + webhooks; 23 checks)
+./scripts/demo-e2e.sh
+
+# Go: build / vet / tests  (tests/performance_suite_test.go FAILS unless PERF_TEST_URL set — ignore)
+go build ./... && go vet ./... && go test ./...
+
+# Worker tests: the local .venv (py3.14) LACKS psycopg_pool/pytest/cryptography.
+# Run them INSIDE the worker container instead:
+docker exec -w /app mpiper-worker python -m unittest discover -s worker/tests -p 'test_*.py' -v
+
+# Mint an auth token from the host (system python3 has `cryptography`; venv does not):
+TOKEN=$(python3 - <<'PY'
+import base64, os
+from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+key=b"0123456789abcdef0123456789abcdef"; nonce=os.urandom(12)
+print(base64.urlsafe_b64encode(nonce+AESGCM(key).encrypt(nonce,b"demo-user",None)).rstrip(b"=").decode())
+PY
+)
+
+# Inspect DB
+docker exec mpiper-postgres psql -U mpiper -d mpiper -c "SELECT asset_id,status,type FROM assets ORDER BY created_at DESC LIMIT 5;"
+
+# Reset all state (assets/variants/objects accumulate across runs)
+docker compose -f docker-compose.yml -f docker-compose.observability.yml down -v
+
+# UIs: Grafana http://localhost:3000 · Prometheus http://localhost:9090 · Tempo via Grafana Explore
+```
+
+---
+
+## 7. Landmines (things that already bit, or will)
+
+- **Worker is single-threaded.** `MAX_CONCURRENT_JOBS` is in `worker/consumer/
+  config.py` but **never used**; `consume()` does `count=1` and processes inline.
+  This is the expected bottleneck Phase 5 should prove — don't "fix" it in Track 3.
+- **Recovery is homegrown.** A 2-min DB scan re-`XADD`s stale jobs; no
+  `XPENDING`/`XAUTOCLAIM`; poison messages are marked `failed` and dropped (no DLQ).
+  That's Track 1, not Track 3.
+- **Global propagator may be unset** in Go — injection silently no-ops without it. Check first.
+- **Sampling.** Code default `TRACE_SAMPLING_RATE=0.1`. Set 1.0 locally or you'll
+  lose most traces and think propagation is broken.
+- **Dedup hides work.** Identical fixtures dedup after the first asset → near-zero
+  work on repeats. The load harness must **fan out unique bytes** to measure real
+  per-job cost.
+- **Cardinality.** asset_id is fine as a *trace/span attribute*; **never** put it on
+  a *metric* label.
+- **Health check.** `cmd/server --health-check` is now a real `/healthz` probe
+  (was previously booting a second server and failing to bind 5010 → api unhealthy
+  → worker wouldn't start). If you change startup, keep that path lightweight.
+- **Rebuild after code changes.** api/worker run from built images:
+  `docker compose ... build api worker && docker compose ... up -d`.
+- **Local ≠ prod.** Trust bottleneck *location* and before/after deltas, not
+  absolute throughput numbers.
+
+---
+
+## 8. Suggested first-session scope
+
+Do **Phase 0 + Phase 1** together (highest value, gets a real cross-boundary trace fast):
+
+1. **Phase 0:** add `deploy.resources.limits` (cpu/mem) to `api` + `worker` in a
+   compose overlay; set `TRACE_SAMPLING_RATE=1.0`; bring up with the observability
+   overlay; capture a baseline `demo-e2e.sh` run and confirm spans land in Tempo.
+2. **Phase 1:** Go `traceparent` injection (enqueue + outbox row + migration) →
+   worker tracer + extraction in `consume()`. 
+
+**Acceptance:** open one asset in Grafana/Tempo and see a single trace from
+`POST /storage/presign` through `enqueue` → (visible queue-wait gap) → worker
+`consume` span. That alone is a satisfying, demoable win.
+
+Then continue with Phases 2–5 from the design doc (pipeline spans + log
+correlation → SLO recording rules + dashboards → k6 harness → first experiment
+writeup that names the worker bottleneck, feeding Track 1).
+
+---
+
+## 9. Repo / git state at handoff
+
+- Branch: `feat/webhook-notifications`; open **PR #18**.
+- Demo-readiness + split-endpoint work is committed (`9404c7a`) and pushed.
+- `docs/enhancements/` (this file + `README.md` + `track-03-observability-and-load.md`)
+  may be **uncommitted** — commit them at the start of the Track 3 session.
+- Key docs to read: `docs/enhancements/README.md` (catalog),
+  `track-03-observability-and-load.md` (plan), `docs/arch/*` (existing outbox/
+  reliability design notes), `CLAUDE.md` (repo conventions; note the stale worker-
+  telemetry line).
diff --git a/docs/enhancements/track-03-observability-and-load.md b/docs/enhancements/track-03-observability-and-load.md
new file mode 100644
index 0000000..43ff9c9
--- /dev/null
+++ b/docs/enhancements/track-03-observability-and-load.md
@@ -0,0 +1,214 @@
+# Track 3 — End-to-end tracing, SLOs & local load testing
+
+**Status:** planning · **Prereq:** none · **Unlocks:** makes every other track measurable
+
+## 1. Problem
+
+We can't improve what we can't see. Right now:
+
+- The **distributed trace breaks at the Redis boundary.** The Go API traces the
+  HTTP request and the `Enqueue` call, but it never injects a `traceparent` into
+  the stream message. The worker has OTel **metrics** but **no tracer** and does
+  no context extraction. So we cannot answer "for *this* asset, where did the 40
+  seconds go?" as a single trace spanning API → outbox → Redis → worker → ffmpeg →
+  variant write.
+- We have metrics but **no SLOs** — no agreed definition of "good", so no way to
+  say whether a change helped.
+- We have **no way to generate controlled load**, so we've never seen the system
+  bend. The single-threaded worker (Track 1) is an invisible bottleneck until
+  something pushes on it.
+
+The user's real question: *this is a local project — how do we test under load
+and actually understand what's working, failing, and needs optimization?*
+
+That question is answered in §3.
+
+## 2. Goals / Non-goals
+
+**Goals**
+- One trace per asset, end to end, across the queue boundary, viewable in Tempo.
+- A small, explicit set of **SLIs and SLOs** for the pipeline.
+- A repeatable **local load harness** that can saturate the system on a laptop.
+- Grafana dashboards (RED for the API, USE for the worker/host, a pipeline-latency
+  funnel, queue lag) wired so a metric spike links to an example trace (exemplars).
+- A written **bottleneck-analysis loop**: load → observe → locate → optimize → re-run → compare.
+
+**Non-goals**
+- Production-scale absolute numbers. Local results are **relative** — they reveal
+  bottlenecks and validate *direction*, not real-world capacity (see §7).
+- Alerting/paging infrastructure (note SLO burn-rate alerts as a follow-up).
+- Replacing the existing stack — we extend the bundled Tempo/Prometheus/Loki/Grafana.
+
+## 3. Can you load-test meaningfully on a laptop? Yes — here's the methodology
+
+The misconception is that load testing requires cloud scale. It doesn't. Load
+testing is about **saturating the system relative to its own capacity** and
+watching where it bends. A single-threaded worker on a laptop saturates at a
+handful of concurrent jobs — you can absolutely push it past that locally.
+
+The thing that makes local results *interpretable* is **pinning resources** so
+runs are reproducible and the bottleneck isn't hidden by spare laptop cores. We
+add CPU/memory limits to the `api` and `worker` containers (compose `deploy.
+resources.limits`) so "the worker is the bottleneck" is a stable, observable fact
+rather than something that moves run to run.
+
+**The loop we're building:**
+
+```
+            ┌─────────────────────────────────────────────┐
+            │ 1. Define SLIs/SLOs (what "good" means)       │
+            └───────────────┬─────────────────────────────┘
+                            ▼
+            ┌─────────────────────────────────────────────┐
+            │ 2. Instrument end-to-end (close the trace gap)│
+            └───────────────┬─────────────────────────────┘
+                            ▼
+   ┌────────────┐   generate    ┌─────────────────────────┐
+   │ k6 (host)  │ ────────────▶ │ MPiper (CPU-pinned)      │
+   │ load model │   presign→PUT │  API + worker            │
+   └────────────┘   →complete   └───────────┬─────────────┘
+        │ client-side metrics               │ app OTel traces+metrics
+        ▼                                    ▼
+   ┌─────────────────────────────────────────────────────┐
+   │ 3. Observe in Grafana: RED, USE, pipeline funnel,     │
+   │    queue lag — metric spike → exemplar trace in Tempo │
+   └───────────────┬─────────────────────────────────────┘
+                   ▼
+   ┌─────────────────────────────────────────────────────┐
+   │ 4. Locate bottleneck (trace waterfall + USE) →        │
+   │    optimize → re-run same profile → compare           │
+   └─────────────────────────────────────────────────────┘
+```
+
+### Load model (this is the subtle part)
+
+- **Closed model (fixed VUs):** N virtual users each loop presign→upload→complete
+  as fast as they can. Good for finding max throughput and saturation point.
+- **Open model (fixed arrival rate):** X new uploads/sec regardless of how fast the
+  system responds. Good for finding the **latency knee** and watching queue lag
+  grow when arrival rate > service rate (a live demonstration of Little's Law:
+  `L = λW`).
+
+We use **k6** run from the **host** (like `scripts/demo-e2e.sh`): the host can
+reach both the API (`localhost:5010`) and MinIO (`localhost:9000`), so k6 performs
+the *real* client flow — presign, `PUT` the file to the public endpoint, then
+`complete`. k6 uploads real fixtures (the existing image + `tests/test_assets/
+sample.mp4`), optionally fanning out copies with unique bytes to defeat content-hash
+dedup when we want true per-job work.
+
+Two views of the same run:
+- **Client view** (k6's own metrics): request rate, error rate, client-side
+  latency percentiles → remote-written to the bundled Prometheus.
+- **Server view** (MPiper's OTel): the pipeline's internal spans and metrics —
+  this is the point of the track, and what we'll mostly read.
+
+## 4. Design
+
+### 4.1 Close the trace gap (the core engineering work)
+
+1. **Inject context on enqueue (Go).** When the outbox relay (or `RedisQueue.
+   Enqueue`) publishes, inject the active span context as a `traceparent` field in
+   the stream message using the OTel propagator. The outbox row should carry the
+   trace context too (so the trace survives the store-and-forward hop).
+2. **Extract + continue on consume (Python).** Add an OTel **tracer** to the worker
+   (mirroring `worker/utils/metrics.py`). In `consume()`, extract `traceparent`
+   from the message and start the consumer span as a **child** (a span link is the
+   correct primitive for queue fan-in; we'll use a child span with a link to keep
+   the waterfall readable).
+3. **Span the pipeline stages.** Wrap `process_asset_dispatch`, download,
+   dedup-check, each image variant, and each ffmpeg invocation (poster / transcode /
+   preview) in spans with attributes (asset_id, type, bytes, role, ffmpeg rc).
+4. **Correlate logs.** Stamp `trace_id`/`span_id` into worker + API structured logs
+   so Loki ↔ Tempo cross-linking works in Grafana.
+
+End result: open an asset in Tempo and see `HTTP POST /presign … → enqueue →
+(time in queue) → worker consume → download → transcode_720p → write variant`,
+with the **queue wait time** visible as the gap between enqueue and consume.
+
+### 4.2 SLIs / SLOs (initial, deliberately small)
+
+| SLI | Definition | Initial SLO (local) |
+|-----|------------|---------------------|
+| Presign latency | p95 of `POST /storage/presign` | < 150 ms |
+| Image ready latency | p95 (complete → asset `ready`) for images | < 5 s |
+| Video ready latency | p95 (complete → asset `ready`) for videos | < 60 s |
+| Queue wait | p95 (enqueue → consume start) | < 2 s |
+| Job success rate | done / (done + failed) | > 99% |
+| Webhook delivery latency | p95 (event row created → delivered) | < 10 s |
+
+These come straight from spans/metrics we'll have. The numbers are starting
+guesses; the *point* is to make them explicit, then move them based on data.
+
+### 4.3 Dashboards (Grafana, provisioned in `observability/grafana/dashboards`)
+
+- **API — RED:** request **R**ate, **E**rror rate, **D**uration (p50/p95/p99) per route.
+- **Worker/host — USE:** CPU/mem **U**tilization, **S**aturation (queue depth,
+  in-flight jobs), **E**rrors. (cAdvisor/node metrics or the collector's own.)
+- **Pipeline funnel:** uploaded → processing → ready/failed counts + the
+  per-stage latency breakdown (from spans).
+- **Queue health:** stream length, oldest-pending age, outbox relay lag (metric
+  already exists), webhook pending gauge (already exists).
+- **Exemplars:** histogram panels link a bucket spike to a concrete Tempo trace.
+
+### 4.4 Bottleneck-analysis loop (documented runbook)
+
+For each experiment: fix a load profile, run it, then read in order — (1) is the
+SLO breached? (2) USE: is the worker CPU-saturated or queue-saturated? (3) open an
+exemplar trace: which span dominates? (4) form a hypothesis, change one thing,
+re-run the **same** profile, compare. Record results in an `experiments/` log so
+"the transcode span dropped from 38s→6s after X" is captured.
+
+## 5. Phased implementation plan
+
+Each phase is independently demoable.
+
+- **Phase 0 — Resource pinning & baseline.** Add `deploy.resources.limits` to api/
+  worker; bring up the observability overlay; capture a one-shot baseline with the
+  existing `demo-e2e.sh`. *Demo:* Grafana shows the run; numbers are reproducible.
+- **Phase 1 — Trace propagation.** Inject `traceparent` on enqueue (Go) + outbox
+  row; extract + continue in the worker; add the worker tracer. *Demo:* a single
+  Tempo trace spans API→worker for one asset, with visible queue wait.
+- **Phase 2 — Pipeline spans + log correlation.** Span dispatch/download/dedup/
+  each variant/each ffmpeg call; add trace IDs to logs. *Demo:* trace waterfall
+  shows per-stage timing; click a log line → its trace.
+- **Phase 3 — SLO recording rules + dashboards.** Prometheus recording rules for
+  the SLIs in §4.2; provision the four dashboards. *Demo:* a dashboard shows each
+  SLI vs its SLO target.
+- **Phase 4 — k6 load harness.** `loadtest/` with closed- and open-model scripts,
+  a host-run wrapper, fixture fan-out, and k6→Prometheus remote write. *Demo:*
+  `./loadtest/run.sh open --rate 5/s --duration 3m` drives the system; Grafana
+  shows queue lag climbing and the latency knee.
+- **Phase 5 — First experiment writeup.** Run a saturating profile, capture the
+  bottleneck (expected: the single-threaded worker), and write it up as the
+  motivating evidence for **Track 1**. *Demo:* `experiments/0001-worker-saturation.md`
+  with before numbers + the trace proving where time goes.
+
+## 6. How we'll know it works (acceptance)
+
+- A Tempo trace for any asset includes both API and worker spans, with queue wait
+  time visible.
+- Every SLI in §4.2 renders on a dashboard against its target.
+- `loadtest/run.sh` reproducibly drives the system into SLO breach, and the
+  responsible stage is identifiable from a trace within ~2 minutes of looking.
+- Phase 5 writeup names the bottleneck with evidence — the input to Track 1.
+
+## 7. Risks & honest caveats
+
+- **Local ≠ production.** Absolute numbers are not portable (laptop CPU, no network
+  latency, single-node Redis/PG). Treat results as **relative**: bottleneck
+  location and before/after deltas are trustworthy; "we do N uploads/sec" is not.
+- **Noisy neighbor.** k6, the app, and the observability stack share the laptop.
+  Pin app resources and keep k6 modest; consider running k6 with `--throw` budgets.
+- **Container CPU limits change behavior** (e.g. ffmpeg threads). That's fine — it's
+  what makes runs comparable — but document the limits with each experiment.
+- **Trace cardinality / sampling.** Asset-ID attributes are high-cardinality on
+  *traces* (OK) but must never become metric labels. Keep `TRACE_SAMPLING_RATE`
+  in mind; sample at 100% locally, lower in prod.
+- **Dedup hides work.** Identical fixtures dedup after first processing; the load
+  harness must fan out unique bytes when measuring real per-job cost.
+
+## 8. Follow-ups (out of scope here)
+
+- SLO **burn-rate alerting** (multi-window) once SLOs stabilize.
+- Continuous profiling (Pyroscope) to attribute CPU *inside* a span.
+- CI smoke load test with a latency budget (feeds Track 8).
diff --git a/experiments/0001-worker-saturation.md b/experiments/0001-worker-saturation.md
new file mode 100644
index 0000000..8214ee6
--- /dev/null
+++ b/experiments/0001-worker-saturation.md
@@ -0,0 +1,117 @@
+# Experiment 0001 — Worker saturation under load
+
+**Date:** 2026-06-30 · **Track:** 3 (observability & load) · **Feeds:** Track 1 (concurrent worker)
+**Status:** complete
+
+## Hypothesis
+
+The Python worker is single-threaded (`consume()` reads one message, `count=1`,
+and processes it inline; `MAX_CONCURRENT_JOBS` exists in config but is unused).
+Under sustained upload load the worker — not the API — should be the bottleneck,
+with the Redis stream growing without bound once arrival rate exceeds the
+worker's service rate (Little's Law, `L = λW`).
+
+## Setup (record this with every run)
+
+- **Resource pinning** (`docker-compose.loadtest.yml`): `api` = 1.0 CPU / 512 MB,
+  `worker` = **1.0 CPU** / 1 GB. The single-CPU pin makes the bottleneck a stable,
+  observable fact rather than something that moves with spare laptop cores.
+- **Sampling:** `TRACE_SAMPLING_RATE=1.0` (every asset traced).
+- **Stack:** core + observability + loadtest overlays, all up.
+- **Workload:** images only, unique bytes per iteration (dedup defeated — see
+  `loadtest/lib.js`). Fixture `worker/tests/test_assets/image.jpg`, 3 webp
+  variants per asset.
+- **Profile:** open model, `./loadtest/run.sh open --rate 10/s --duration 60s`
+  (fixed arrival rate; λ = 10 uploads/s).
+
+> Local results are **relative**. Trust the bottleneck location and the
+> before/after deltas, not the absolute throughput — laptop CPU, no network
+> latency, single-node Redis/Postgres.
+
+## Method (the loop)
+
+1. Is an SLO breached? 2. USE — is the worker CPU- or queue-saturated?
+3. Open an exemplar trace — which span dominates? 4. Form a hypothesis, change
+one thing, re-run the **same** profile, compare.
+
+## Results (before — no optimisation yet)
+
+| Signal | Value | Source |
+|--------|-------|--------|
+| Arrival rate (λ) | 10.0 uploads/s | k6 `mpiper_assets_submitted` |
+| Worker service rate (μ) | **1.13 jobs/s** | `rate(mpiper_mpiper_job_processing_success_total[2m])` |
+| Mean asset processing time | 0.81 s | `…_duration_seconds_sum / …_count` |
+| Queue depth before → after | 3985 → 4370 (↑) | `sli:queue_depth:current` |
+| Worker CPU | **98.5 %** (pinned at 1 CPU) | `docker stats` |
+| API CPU | 0.4 % | `docker stats` |
+| Presign p95 (API) | 48 ms (SLO < 150 ms ✅) | `sli:presign_latency_seconds:p95` |
+| Job success rate | 1.0 (SLO > 99 % ✅) | `sli:job_success_ratio:ratio_rate5m` |
+
+**Reading:** λ (10/s) ≫ μ (1.13/s). The queue grows monotonically; the system is
+**unstable for any arrival rate above ≈ 1.1 uploads/s**. The API is essentially
+idle (0.4 % CPU, presign well inside SLO) while the worker is pinned at 98.5 %.
+The bottleneck is unambiguously the worker, and specifically its
+**single-threaded, one-job-at-a-time** processing loop — not CPU work that is
+inherently slow (a single image is ~0.8 s), but the complete absence of
+concurrency.
+
+## Trace evidence (where the time goes)
+
+With the trace gap now closed (Track 3, Phase 1), one asset is a single trace
+from the API through the queue into the worker — example, 19 spans:
+
+```
+/api/v1/assets/{id}/complete                 (API HTTP request)
+└ AssetHandler.MarkAssetUploaded
+  └ AssetService.MarkAssetUploaded
+    ├ StorageClient.GetObjectAttrs → S3.GetObjectAttrs
+    └ Database.Transaction
+      ├ AssetRepo.MarkAssetUploadedTx
+      ├ AssetRepo.InsertProcessAssetJobTx
+      └ OutboxRepo.InsertTx
+        └ outbox.publish                       (relay re-activates stored context)
+          └ RedisQueue.Enqueue                 (injects traceparent into the message)
+            ├ RedisQueue.doXAddWithRetry
+            └ worker.consume                   (── crosses the Redis boundary ──)
+              └ process.dispatch
+                ├ process.download
+                ├ process.dedup_check
+                └ image.variant × 3
+```
+
+The **gap between `RedisQueue.Enqueue` and `worker.consume`** is the queue wait —
+the time an asset spends backed up behind the single worker. Under this profile
+that gap dominates end-to-end latency, and it grows for every asset because the
+backlog only ever increases. The in-worker stages (download, dedup, 3 variants)
+are individually fast; the cost is waiting for a free worker, not the work itself.
+
+## Conclusion
+
+The single-threaded worker is the bottleneck, with a service rate of ~1.1
+jobs/s. The pipeline cannot keep up with anything beyond a trickle of uploads,
+and the deficit manifests as an unbounded Redis backlog and ever-growing
+queue-wait latency — while the API and host CPU sit idle. This is the motivating
+evidence for **Track 1 (concurrent worker + stream recovery + DLQ)**: honour
+`MAX_CONCURRENT_JOBS` as a real concurrency limit (process pool for the
+CPU-bound Pillow/ffmpeg work) so μ scales with available cores instead of being
+fixed at one.
+
+## Reproduce
+
+```bash
+docker compose -f docker-compose.yml -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml up -d --build
+./loadtest/run.sh open --rate 10/s --duration 60s
+# Grafana http://localhost:3000 → MPiper folder:
+#   - "Worker / App Saturation (USE)": queue depth climbing, in-flight pinned
+#   - "Pipeline Funnel": ready/s flat at ~1.1 while uploaded/s tracks arrival
+# Tempo (Explore): TraceQL `{ name="worker.consume" }` → open one → see the
+#   enqueue→consume queue-wait gap and the per-stage breakdown.
+```
+
+## Next experiment
+
+After Track 1 lands a bounded worker pool, re-run this **exact** profile and
+compare: μ should rise roughly with the pool size (until CPU-bound), queue depth
+should stabilise instead of growing, and the enqueue→consume gap should shrink.
+Record results as `0002-concurrent-worker.md`.
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 054436e..7aaf5f6 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -2,6 +2,7 @@ package metrics
 
 import (
 	"context"
+	"database/sql"
 	"runtime"
 	"time"
 
@@ -40,13 +41,16 @@ type Metrics struct {
 	StorageOperationTotal    metric.Int64Counter
 	StorageOperationErrors   metric.Int64Counter
 
-	DBQueryDuration     metric.Float64Histogram
-	DBQueryTotal        metric.Int64Counter
-	DBQueryErrors       metric.Int64Counter
-	DBConnectionsActive metric.Int64UpDownCounter
-	DBConnectionsIdle   metric.Int64UpDownCounter
-	DBTransactionTotal  metric.Int64Counter
-	DBTransactionErrors metric.Int64Counter
+	DBQueryDuration        metric.Float64Histogram
+	DBQueryTotal           metric.Int64Counter
+	DBQueryErrors          metric.Int64Counter
+	DBConnectionsActive    metric.Int64ObservableGauge
+	DBConnectionsIdle      metric.Int64ObservableGauge
+	DBConnectionsOpen      metric.Int64ObservableGauge
+	DBConnectionsMaxOpen   metric.Int64ObservableGauge
+	DBConnectionsWaitCount metric.Int64ObservableGauge
+	DBTransactionTotal     metric.Int64Counter
+	DBTransactionErrors    metric.Int64Counter
 
 	QueueMessagePublished metric.Int64Counter
 	QueueMessageConsumed  metric.Int64Counter
@@ -83,6 +87,22 @@ func (m *Metrics) RegisterQueueDepthFunc(fn func(context.Context) (int64, error)
 	return err
 }
 
+// RegisterDBStatsFunc wires sql.DBStats (connection-pool stats) to the DB
+// connection gauges. fn typically returns db.Stats(). One callback observes all
+// pool gauges from a single stats snapshot so they stay mutually consistent.
+func (m *Metrics) RegisterDBStatsFunc(fn func() sql.DBStats) error {
+	_, err := m.meter.RegisterCallback(func(_ context.Context, o metric.Observer) error {
+		s := fn()
+		o.ObserveInt64(m.DBConnectionsActive, int64(s.InUse))
+		o.ObserveInt64(m.DBConnectionsIdle, int64(s.Idle))
+		o.ObserveInt64(m.DBConnectionsOpen, int64(s.OpenConnections))
+		o.ObserveInt64(m.DBConnectionsMaxOpen, int64(s.MaxOpenConnections))
+		o.ObserveInt64(m.DBConnectionsWaitCount, s.WaitCount)
+		return nil
+	}, m.DBConnectionsActive, m.DBConnectionsIdle, m.DBConnectionsOpen, m.DBConnectionsMaxOpen, m.DBConnectionsWaitCount)
+	return err
+}
+
 // RegisterOutboxPendingFunc wires a callback to the OutboxPendingGauge.
 func (m *Metrics) RegisterOutboxPendingFunc(fn func(context.Context) (int64, error)) error {
 	_, err := m.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
@@ -146,7 +166,9 @@ func InitMetrics(ctx context.Context, logger *zap.Logger) (*Metrics, func(contex
 
 	mp := sdkmetric.NewMeterProvider(
 		sdkmetric.WithResource(res),
-		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exp)),
+		// 15s export interval (matches the worker) so RED/SLO dashboards stay
+		// responsive under load; the SDK default of 60s lags the 5m rate window.
+		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(15*time.Second))),
 		sdkmetric.WithView(
 			sdkmetric.NewView(
 				sdkmetric.Instrument{Name: "http.server.request.duration", Kind: sdkmetric.InstrumentKindHistogram},
@@ -157,6 +179,18 @@ func InitMetrics(ctx context.Context, logger *zap.Logger) (*Metrics, func(contex
 				},
 			),
 		),
+		sdkmetric.WithView(
+			// Sub-second-resolution buckets for the queue lag histogram; the
+			// default buckets are too coarse and inflate the queue-wait SLI.
+			sdkmetric.NewView(
+				sdkmetric.Instrument{Name: "queue.processing.lag", Kind: sdkmetric.InstrumentKindHistogram},
+				sdkmetric.Stream{
+					Aggregation: sdkmetric.AggregationExplicitBucketHistogram{
+						Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5},
+					},
+				},
+			),
+		),
 	)
 
 	otel.SetMeterProvider(mp)
@@ -280,16 +314,31 @@ func initDatabaseMetrics(m *Metrics, meter metric.Meter, logger *zap.Logger) {
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create DB query errors: %v", err)
 	}
-	m.DBConnectionsActive, err = meter.Int64UpDownCounter("db.connections.active",
-		metric.WithDescription("Number of active database connections"), metric.WithUnit("{connection}"))
+	m.DBConnectionsActive, err = meter.Int64ObservableGauge("db.connections.active",
+		metric.WithDescription("Number of in-use database connections"), metric.WithUnit("{connection}"))
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create DB active connections: %v", err)
 	}
-	m.DBConnectionsIdle, err = meter.Int64UpDownCounter("db.connections.idle",
+	m.DBConnectionsIdle, err = meter.Int64ObservableGauge("db.connections.idle",
 		metric.WithDescription("Number of idle database connections"), metric.WithUnit("{connection}"))
 	if err != nil {
 		logger.Sugar().Fatalf("Failed to create DB idle connections: %v", err)
 	}
+	m.DBConnectionsOpen, err = meter.Int64ObservableGauge("db.connections.open",
+		metric.WithDescription("Number of open database connections (in-use + idle)"), metric.WithUnit("{connection}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create DB open connections: %v", err)
+	}
+	m.DBConnectionsMaxOpen, err = meter.Int64ObservableGauge("db.connections.max_open",
+		metric.WithDescription("Configured max open database connections (0 = unlimited)"), metric.WithUnit("{connection}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create DB max open connections: %v", err)
+	}
+	m.DBConnectionsWaitCount, err = meter.Int64ObservableGauge("db.connections.wait_count",
+		metric.WithDescription("Cumulative count of connection waits (pool contention)"), metric.WithUnit("{wait}"))
+	if err != nil {
+		logger.Sugar().Fatalf("Failed to create DB wait count: %v", err)
+	}
 	m.DBTransactionTotal, err = meter.Int64Counter("db.transaction.total",
 		metric.WithDescription("Total number of database transactions"), metric.WithUnit("{transaction}"))
 	if err != nil {
diff --git a/internal/middleware/logging.go b/internal/middleware/logging.go
index c046ab2..7c453be 100644
--- a/internal/middleware/logging.go
+++ b/internal/middleware/logging.go
@@ -37,6 +37,11 @@ func LoggerMiddleware(l *zap.Logger) func(next http.Handler) http.Handler {
 				zap.String("proto", r.Proto),
 			)
 
+			// Stamp trace_id/span_id from the active span (TracingMiddleware runs
+			// before this) so request logs cross-link to their Tempo trace and
+			// any handler/service using the context logger inherits the IDs.
+			reqLogger = applogger.WithTrace(r.Context(), reqLogger)
+
 			ctx := applogger.WithLogger(r.Context(), reqLogger)
 			r = r.WithContext(ctx)
 
diff --git a/internal/middleware/metrics.go b/internal/middleware/metrics.go
index 5e02ae4..542c9be 100644
--- a/internal/middleware/metrics.go
+++ b/internal/middleware/metrics.go
@@ -32,43 +32,51 @@ func MetricsMiddleware(m *metrics.Metrics) func(http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			start := time.Now()
 
-			route := chi.RouteContext(r.Context()).RoutePattern()
-			if route == "" {
-				route = "unknown"
-			}
-
 			wrapped := &metricsResponseWriter{ResponseWriter: w, statusCode: http.StatusOK}
 
-			attrs := []attribute.KeyValue{
-				attribute.String("http.method", r.Method),
-				attribute.String("http.route", route),
-			}
+			// In-flight gauge: keyed by method only. The route pattern is not yet
+			// known here (chi populates it during routing, after this middleware),
+			// so it would be "unknown"; using it on +1/-1 still nets to zero but
+			// adds no value. Per-route labels are applied post-routing below.
+			inflightAttrs := []attribute.KeyValue{attribute.String("http.method", r.Method)}
 
 			if m != nil {
-				m.HTTPActiveRequests.Add(r.Context(), 1, metric.WithAttributes(attrs...))
-				defer m.HTTPActiveRequests.Add(r.Context(), -1, metric.WithAttributes(attrs...))
+				m.HTTPActiveRequests.Add(r.Context(), 1, metric.WithAttributes(inflightAttrs...))
+				defer m.HTTPActiveRequests.Add(r.Context(), -1, metric.WithAttributes(inflightAttrs...))
 			}
 
 			defer func() {
 				if rec := recover(); rec != nil {
 					wrapped.statusCode = http.StatusInternalServerError
-					recordHTTPMetrics(m, r, wrapped, start, attrs)
+					recordHTTPMetrics(m, r, wrapped, start)
 					panic(rec)
 				}
 			}()
 
 			next.ServeHTTP(wrapped, r)
-			recordHTTPMetrics(m, r, wrapped, start, attrs)
+			recordHTTPMetrics(m, r, wrapped, start)
 		})
 	}
 }
 
-func recordHTTPMetrics(m *metrics.Metrics, r *http.Request, w *metricsResponseWriter, start time.Time, baseAttrs []attribute.KeyValue) {
+func recordHTTPMetrics(m *metrics.Metrics, r *http.Request, w *metricsResponseWriter, start time.Time) {
 	if m == nil {
 		return
 	}
+	// chi populates the matched route pattern during routing, so it is only
+	// available now (after ServeHTTP). Reading it earlier yields "" — the source
+	// of the previous "unknown" http_route label that broke per-route SLOs.
+	route := chi.RouteContext(r.Context()).RoutePattern()
+	if route == "" {
+		route = "unknown"
+	}
+
 	duration := time.Since(start).Seconds()
-	attrs := append(baseAttrs, attribute.Int("http.status_code", w.statusCode))
+	attrs := []attribute.KeyValue{
+		attribute.String("http.method", r.Method),
+		attribute.String("http.route", route),
+		attribute.Int("http.status_code", w.statusCode),
+	}
 
 	m.HTTPRequestDuration.Record(r.Context(), duration, metric.WithAttributes(attrs...))
 	m.HTTPRequestCount.Add(r.Context(), 1, metric.WithAttributes(attrs...))
diff --git a/internal/models/outbox.go b/internal/models/outbox.go
index 297883c..58647d8 100644
--- a/internal/models/outbox.go
+++ b/internal/models/outbox.go
@@ -13,6 +13,12 @@ type OutboxEvent struct {
 	JobID       *int64          `db:"job_id"`
 	Event       string          `db:"event"`
 	Payload     json.RawMessage `db:"payload"`
+	// Traceparent carries the W3C trace context captured when the row was
+	// written, so the distributed trace survives the outbox store-and-forward
+	// hop. The relay re-activates it before publishing to Redis. Nullable:
+	// rows written before this column existed (or without an active span) have
+	// no trace context.
+	Traceparent *string         `db:"traceparent"`
 	Status      string          `db:"status"`
 	Attempts    int             `db:"attempts"`
 	MaxAttempts int             `db:"max_attempts"`
diff --git a/internal/outbox/relay.go b/internal/outbox/relay.go
index ae485b5..6e2f8a8 100644
--- a/internal/outbox/relay.go
+++ b/internal/outbox/relay.go
@@ -8,6 +8,11 @@ import (
 	"github.com/rndmcodeguy20/mpiper/internal/metrics"
 	"github.com/rndmcodeguy20/mpiper/internal/queue"
 	"github.com/rndmcodeguy20/mpiper/internal/repository"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/trace"
 	"go.uber.org/zap"
 )
 
@@ -17,12 +22,13 @@ type Relay struct {
 	queue    queue.Queue
 	logger   *zap.Logger
 	m        *metrics.Metrics
+	tracer   trace.Tracer
 	interval time.Duration
 	batch    int
 }
 
 func NewRelay(repo repository.OutboxRepository, q queue.Queue, logger *zap.Logger, m *metrics.Metrics, interval time.Duration, batch int) *Relay {
-	return &Relay{repo: repo, queue: q, logger: logger, m: m, interval: interval, batch: batch}
+	return &Relay{repo: repo, queue: q, logger: logger, m: m, tracer: otel.Tracer("mpiper-api"), interval: interval, batch: batch}
 }
 
 // Start runs the relay loop until ctx is cancelled. It finishes the in-flight batch before returning.
@@ -71,8 +77,26 @@ func (r *Relay) tick(ctx context.Context) {
 			continue
 		}
 
-		if _, err := r.queue.Enqueue(ctx, payload); err != nil {
+		// Re-activate the producer's trace context (captured when the row was
+		// written) so the publish + enqueue spans rejoin the original request
+		// trace instead of starting a disconnected root. tick() runs on a
+		// background ticker context, so without this the trace would break here.
+		publishCtx := ctx
+		if row.Traceparent != nil && *row.Traceparent != "" {
+			carrier := propagation.MapCarrier{"traceparent": *row.Traceparent}
+			publishCtx = otel.GetTextMapPropagator().Extract(ctx, carrier)
+		}
+		publishCtx, span := r.tracer.Start(publishCtx, "outbox.publish")
+		span.SetAttributes(
+			attribute.Int64("outbox.row_id", row.ID),
+			attribute.String("event", row.Event),
+		)
+
+		if _, err := r.queue.Enqueue(publishCtx, payload); err != nil {
 			r.logger.Warn("outbox relay: enqueue failed", zap.Int64("id", row.ID), zap.Error(err))
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "enqueue failed")
+			span.End()
 			_ = r.repo.IncrementAttempts(ctx, row.ID, err.Error())
 			if row.Attempts+1 >= row.MaxAttempts {
 				_ = r.repo.MarkFailed(ctx, row.ID, err.Error())
@@ -82,6 +106,7 @@ func (r *Relay) tick(ctx context.Context) {
 			}
 			continue
 		}
+		span.End()
 
 		publishedIDs = append(publishedIDs, row.ID)
 	}
diff --git a/internal/outbox/relay_trace_test.go b/internal/outbox/relay_trace_test.go
new file mode 100644
index 0000000..9fc7f54
--- /dev/null
+++ b/internal/outbox/relay_trace_test.go
@@ -0,0 +1,124 @@
+package outbox
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/rndmcodeguy20/mpiper/internal/models"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/trace"
+	"go.uber.org/zap"
+)
+
+// fakeOutboxRepo is an in-memory OutboxRepository for white-box relay tests.
+type fakeOutboxRepo struct {
+	pending     []models.OutboxEvent
+	published   []int64
+	incremented []int64
+	failed      []int64
+}
+
+func (f *fakeOutboxRepo) InsertTx(_ context.Context, _ *sql.Tx, _ models.OutboxEvent) error {
+	return nil
+}
+func (f *fakeOutboxRepo) FetchPendingBatch(_ context.Context, _ int) ([]models.OutboxEvent, error) {
+	out := f.pending
+	f.pending = nil // single tick
+	return out, nil
+}
+func (f *fakeOutboxRepo) MarkPublished(_ context.Context, ids []int64) error {
+	f.published = append(f.published, ids...)
+	return nil
+}
+func (f *fakeOutboxRepo) IncrementAttempts(_ context.Context, id int64, _ string) error {
+	f.incremented = append(f.incremented, id)
+	return nil
+}
+func (f *fakeOutboxRepo) MarkFailed(_ context.Context, id int64, _ string) error {
+	f.failed = append(f.failed, id)
+	return nil
+}
+func (f *fakeOutboxRepo) DeletePublishedBefore(_ context.Context, _ time.Time) (int64, error) {
+	return 0, nil
+}
+func (f *fakeOutboxRepo) CountPending(_ context.Context) (int64, error) { return 0, nil }
+
+// capturingQueue records the context handed to Enqueue.
+type capturingQueue struct {
+	gotCtx     context.Context
+	gotPayload map[string]interface{}
+}
+
+func (q *capturingQueue) Enqueue(ctx context.Context, payload map[string]interface{}) (string, error) {
+	q.gotCtx = ctx
+	q.gotPayload = payload
+	return "1-0", nil
+}
+
+func TestRelay_ReactivatesStoredTraceContext(t *testing.T) {
+	otel.SetTextMapPropagator(propagation.TraceContext{})
+
+	// Build a known producer span context and serialize it as a traceparent.
+	traceID, _ := trace.TraceIDFromHex("0af7651916cd43dd8448eb211c80319c")
+	spanID, _ := trace.SpanIDFromHex("b7ad6b7169203331")
+	sc := trace.NewSpanContext(trace.SpanContextConfig{
+		TraceID:    traceID,
+		SpanID:     spanID,
+		TraceFlags: trace.FlagsSampled,
+	})
+	carrier := propagation.MapCarrier{}
+	otel.GetTextMapPropagator().Inject(
+		trace.ContextWithSpanContext(context.Background(), sc), carrier)
+	tp := carrier.Get("traceparent")
+	if tp == "" {
+		t.Fatal("failed to build traceparent")
+	}
+
+	payload, _ := json.Marshal(map[string]interface{}{"asset_id": uuid.New().String()})
+	repo := &fakeOutboxRepo{pending: []models.OutboxEvent{
+		{ID: 7, Event: "asset_uploaded", Payload: payload, Traceparent: &tp, MaxAttempts: 5},
+	}}
+	q := &capturingQueue{}
+
+	relay := NewRelay(repo, q, zap.NewNop(), nil, time.Second, 100)
+	relay.tick(context.Background())
+
+	if q.gotCtx == nil {
+		t.Fatal("Enqueue was not called")
+	}
+	gotSC := trace.SpanContextFromContext(q.gotCtx)
+	if !gotSC.IsValid() {
+		t.Fatal("expected a valid span context passed to Enqueue")
+	}
+	if gotSC.TraceID() != traceID {
+		t.Fatalf("trace id not propagated: want %s got %s", traceID, gotSC.TraceID())
+	}
+	if len(repo.published) != 1 || repo.published[0] != 7 {
+		t.Fatalf("expected row 7 marked published, got %v", repo.published)
+	}
+}
+
+func TestRelay_NoTraceparentStillPublishes(t *testing.T) {
+	otel.SetTextMapPropagator(propagation.TraceContext{})
+
+	payload, _ := json.Marshal(map[string]interface{}{"asset_id": uuid.New().String()})
+	repo := &fakeOutboxRepo{pending: []models.OutboxEvent{
+		{ID: 9, Event: "asset_uploaded", Payload: payload, MaxAttempts: 5},
+	}}
+	q := &capturingQueue{}
+
+	relay := NewRelay(repo, q, zap.NewNop(), nil, time.Second, 100)
+	relay.tick(context.Background())
+
+	if q.gotCtx == nil {
+		t.Fatal("Enqueue was not called")
+	}
+	if len(repo.published) != 1 || repo.published[0] != 9 {
+		t.Fatalf("expected row 9 marked published, got %v", repo.published)
+	}
+}
diff --git a/internal/queue/queue.go b/internal/queue/queue.go
index 3197f11..2f4ccfc 100644
--- a/internal/queue/queue.go
+++ b/internal/queue/queue.go
@@ -15,6 +15,7 @@ import (
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/propagation"
 	"go.opentelemetry.io/otel/trace"
 )
 
@@ -126,6 +127,18 @@ func (rq *RedisQueue) Enqueue(ctx context.Context, payload map[string]interface{
 		"body": string(body),
 	}
 
+	// Inject the active trace context as top-level message fields so the worker
+	// can extract it and continue the trace across the queue boundary. The
+	// propagator writes traceparent (and tracestate when present); we copy them
+	// out as separate stream fields alongside body, preserving existing keys.
+	carrier := propagation.MapCarrier{}
+	otel.GetTextMapPropagator().Inject(ctx, carrier)
+	for _, k := range []string{"traceparent", "tracestate", "baggage"} {
+		if v := carrier.Get(k); v != "" {
+			streamEntry[k] = v
+		}
+	}
+
 	args := &redis.XAddArgs{
 		Stream: rq.options.QueueName,
 		Values: streamEntry,
diff --git a/internal/repository/outbox_repo.go b/internal/repository/outbox_repo.go
index 2ed7d3b..2676bb2 100644
--- a/internal/repository/outbox_repo.go
+++ b/internal/repository/outbox_repo.go
@@ -32,8 +32,8 @@ func NewOutboxRepository(db *sqlx.DB, logger *zap.Logger) OutboxRepository {
 
 func (r *outboxRepo) InsertTx(ctx context.Context, tx *sql.Tx, event models.OutboxEvent) error {
 	_, err := tx.ExecContext(ctx,
-		`INSERT INTO event_outbox (aggregate_id, job_id, event, payload, max_attempts) VALUES ($1, $2, $3, $4, $5)`,
-		event.AggregateID, event.JobID, event.Event, event.Payload, event.MaxAttempts,
+		`INSERT INTO event_outbox (aggregate_id, job_id, event, payload, traceparent, max_attempts) VALUES ($1, $2, $3, $4, $5, $6)`,
+		event.AggregateID, event.JobID, event.Event, event.Payload, event.Traceparent, event.MaxAttempts,
 	)
 	return err
 }
@@ -41,7 +41,7 @@ func (r *outboxRepo) InsertTx(ctx context.Context, tx *sql.Tx, event models.Outb
 func (r *outboxRepo) FetchPendingBatch(ctx context.Context, limit int) ([]models.OutboxEvent, error) {
 	var rows []models.OutboxEvent
 	err := r.db.SelectContext(ctx, &rows,
-		`SELECT id, aggregate_id, job_id, event, payload, status, attempts, max_attempts, last_error, created_at, published_at
+		`SELECT id, aggregate_id, job_id, event, payload, traceparent, status, attempts, max_attempts, last_error, created_at, published_at
 		 FROM event_outbox WHERE status = 'pending' ORDER BY id LIMIT $1 FOR UPDATE SKIP LOCKED`, limit)
 	return rows, err
 }
diff --git a/internal/router/router.go b/internal/router/router.go
index 7d44fea..1461967 100644
--- a/internal/router/router.go
+++ b/internal/router/router.go
@@ -105,9 +105,9 @@ func NewRouter(cfg config.EnvConfig, db *sqlx.DB, m *metrics.Metrics) *chi.Mux {
 		AllowCredentials: true,
 		MaxAge:           300,
 	}))
+	r.Use(appMiddleware.TracingMiddleware)
 	r.Use(appMiddleware.LoggerMiddleware(logger))
 	r.Use(middleware.Timeout(MiddlewareTimeout))
-	r.Use(appMiddleware.TracingMiddleware)
 	r.Use(appMiddleware.MetricsMiddleware(m))
 	r.Use(middleware.Compress(5))
 	r.Use(appMiddleware.SlowRequestMiddleware(logger, 2*time.Second))
diff --git a/internal/service/asset.go b/internal/service/asset.go
index 86bb684..59fc4d5 100644
--- a/internal/service/asset.go
+++ b/internal/service/asset.go
@@ -19,6 +19,7 @@ import (
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/propagation"
 	"go.uber.org/zap"
 )
 
@@ -258,11 +259,23 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID)
 		"event":     "asset_uploaded",
 		"timestamp": time.Now().UTC().Format(time.RFC3339),
 	})
+
+	// Capture the active trace context so it survives the outbox store-and-forward
+	// hop. The relay (running on a background ticker context) re-activates this
+	// before publishing to Redis, keeping the whole pipeline in one trace.
+	carrier := propagation.MapCarrier{}
+	otel.GetTextMapPropagator().Inject(ctxOutbox, carrier)
+	var traceparent *string
+	if tp := carrier.Get("traceparent"); tp != "" {
+		traceparent = &tp
+	}
+
 	err = s.outboxRepo.InsertTx(ctxOutbox, tx, models.OutboxEvent{
 		AggregateID: assetID,
 		JobID:       jobID,
 		Event:       "asset_uploaded",
 		Payload:     payload,
+		Traceparent: traceparent,
 	})
 	spanOutbox.End()
 
diff --git a/loadtest/README.md b/loadtest/README.md
new file mode 100644
index 0000000..c189af4
--- /dev/null
+++ b/loadtest/README.md
@@ -0,0 +1,69 @@
+# MPiper Load Harness (k6) — Track 3, Phase 4
+
+Drives the **real** client flow from the host (presign → PUT to MinIO →
+complete), so the whole pipeline — API, outbox relay, Redis, worker, ffmpeg — is
+exercised end-to-end and observable as one trace per asset.
+
+## Install
+
+```bash
+brew install k6          # macOS
+# or see https://grafana.com/docs/k6/latest/set-up/install-k6/
+```
+
+`run.sh` also needs `python3` with the `cryptography` package on the host (used
+only to mint the AES-GCM auth token).
+
+## Prerequisites
+
+Bring the stack up **with the observability overlay** (so Prometheus accepts
+k6's remote-write) and ideally the **loadtest overlay** (CPU-pinned, full
+sampling) so runs are reproducible:
+
+```bash
+docker compose \
+  -f docker-compose.yml \
+  -f docker-compose.observability.yml \
+  -f docker-compose.loadtest.yml \
+  up -d --build
+```
+
+## Run
+
+```bash
+# CLOSED model — fixed VUs hammer the system (find max throughput / saturation)
+./loadtest/run.sh closed --vus 10 --duration 2m
+./loadtest/run.sh closed --vus 20 --duration 3m --ramp
+
+# OPEN model — fixed arrival rate (find the latency knee; watch queue lag grow)
+./loadtest/run.sh open --rate 5/s --duration 3m
+./loadtest/run.sh open --rate 10/s --duration 3m --max-vus 400
+```
+
+Options: `--fixture PATH`, `--base-url URL`, `--no-prometheus`.
+
+## What to watch
+
+- **k6 terminal summary** — client-side request rate, error rate, and the custom
+  trends (`mpiper_presign_latency_ms`, `mpiper_upload_latency_ms`,
+  `mpiper_complete_latency_ms`). Thresholds map to the §4.2 SLOs and fail the run
+  on breach (exit non-zero).
+- **Grafana** (http://localhost:3000) — the Track 3 dashboards: API RED, the
+  app-saturation/USE view (queue depth, in-flight, backlogs), the pipeline
+  funnel, and queue health. In the open model, queue depth climbing while the
+  API stays healthy is the worker bottleneck made visible.
+- **Tempo** — click a latency exemplar on a histogram panel to jump straight to
+  the trace for that asset and see which span dominates.
+
+## Dedup fan-out
+
+The worker dedups by content hash, so identical bytes do almost no work after
+the first asset. The harness appends per-iteration unique bytes **after** the
+JPEG end-of-image marker (decoders ignore trailing bytes), yielding a valid but
+unique-hash image so every iteration costs real work. See `lib.js`.
+
+## Caveat
+
+Local results are **relative**: trust the bottleneck *location* and
+before/after deltas, not absolute throughput. Always record the resource limits
+(from `docker-compose.loadtest.yml`) with each experiment.
diff --git a/loadtest/closed_model.js b/loadtest/closed_model.js
new file mode 100644
index 0000000..f66b37b
--- /dev/null
+++ b/loadtest/closed_model.js
@@ -0,0 +1,42 @@
+// loadtest/closed_model.js
+//
+// CLOSED model: a fixed pool of VUs, each looping the upload flow as fast as
+// the system allows. Good for finding max throughput and the saturation point
+// of the (single-threaded) worker.
+//
+//   VUS=10 DURATION=2m k6 run loadtest/closed_model.js
+//   STAGES=1            -> ramping profile (override via env, see below)
+//
+// Prefer the wrapper: ./loadtest/run.sh closed --vus 10 --duration 2m
+
+import { runUploadFlow, sloThresholds } from "./lib.js";
+
+const VUS = parseInt(__ENV.VUS || "10", 10);
+const DURATION = __ENV.DURATION || "2m";
+
+export const options = {
+  scenarios: {
+    closed: __ENV.RAMP === "1"
+      ? {
+          executor: "ramping-vus",
+          startVUs: 1,
+          stages: [
+            { duration: "30s", target: VUS },
+            { duration: DURATION, target: VUS },
+            { duration: "30s", target: 0 },
+          ],
+          gracefulStop: "30s",
+        }
+      : {
+          executor: "constant-vus",
+          vus: VUS,
+          duration: DURATION,
+          gracefulStop: "30s",
+        },
+  },
+  thresholds: sloThresholds,
+};
+
+export default function () {
+  runUploadFlow();
+}
diff --git a/loadtest/lib.js b/loadtest/lib.js
new file mode 100644
index 0000000..5007d63
--- /dev/null
+++ b/loadtest/lib.js
@@ -0,0 +1,123 @@
+// loadtest/lib.js
+//
+// Shared helpers for the MPiper k6 load harness. Each iteration performs the
+// REAL client flow from the host, exactly like scripts/demo-e2e.sh:
+//
+//   1. POST /api/v1/storage/presign           -> uploadUrl + assetId
+//   2. PUT <uploadUrl> (bytes straight to MinIO at the public endpoint)
+//   3. GET /api/v1/assets/{assetId}/complete  -> enqueues processing
+//
+// Dedup defeat: the worker dedups by content hash, so identical bytes do ~no
+// work after the first asset. We append per-iteration unique bytes AFTER the
+// JPEG end-of-image marker (decoders ignore trailing bytes), giving a valid but
+// unique-hash image so we measure real per-job cost. See track-03 §7.
+
+import http from "k6/http";
+import { check } from "k6";
+import { Trend, Rate, Counter } from "k6/metrics";
+
+// --- Config (host-run; see run.sh) ---------------------------------------
+export const BASE_URL = __ENV.BASE_URL || "http://localhost:5010";
+const TOKEN = __ENV.K6_TOKEN || ""; // minted by run.sh (AES-GCM auth token)
+
+// --- Custom metrics mapped to the SLOs (§4.2) ----------------------------
+export const presignLatency = new Trend("mpiper_presign_latency_ms", true);
+export const uploadLatency = new Trend("mpiper_upload_latency_ms", true);
+export const completeLatency = new Trend("mpiper_complete_latency_ms", true);
+export const flowErrors = new Rate("mpiper_flow_errors");
+export const assetsSubmitted = new Counter("mpiper_assets_submitted");
+
+// --- Fixture (loaded once at init) ---------------------------------------
+// open() resolves relative to this script. 'b' returns an ArrayBuffer.
+const FIXTURE_PATH =
+  __ENV.FIXTURE_PATH || "../worker/tests/test_assets/image.jpg";
+const baseFixture = new Uint8Array(open(FIXTURE_PATH, "b"));
+
+function authHeaders(extra) {
+  return Object.assign({ Authorization: `Bearer ${TOKEN}` }, extra || {});
+}
+
+// Build a unique-hash image: base JPEG + a unique trailer (VU/iter/random).
+function uniqueImageBytes() {
+  const tag = `\nMPIPER-LOADTEST-${__VU}-${__ITER}-${Math.random()}`;
+  // k6's JS runtime has no TextEncoder; the tag is ASCII so charCodeAt suffices.
+  const suffix = new Uint8Array(tag.length);
+  for (let i = 0; i < tag.length; i++) suffix[i] = tag.charCodeAt(i) & 0xff;
+  const out = new Uint8Array(baseFixture.length + suffix.length);
+  out.set(baseFixture, 0);
+  out.set(suffix, baseFixture.length);
+  return out.buffer;
+}
+
+// Run one full presign -> upload -> complete flow. Returns true on success.
+export function runUploadFlow() {
+  const bytes = uniqueImageBytes();
+  const contentType = "image/jpeg";
+
+  // 1. presign
+  const presignRes = http.post(
+    `${BASE_URL}/api/v1/storage/presign`,
+    JSON.stringify({
+      fileName: `loadtest-${__VU}-${__ITER}.jpg`,
+      contentType,
+      size: bytes.byteLength,
+    }),
+    { headers: authHeaders({ "Content-Type": "application/json" }), tags: { step: "presign" } }
+  );
+  presignLatency.add(presignRes.timings.duration);
+  const presignOk = check(presignRes, {
+    "presign 2xx": (r) => r.status >= 200 && r.status < 300,
+  });
+  if (!presignOk) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  const data = presignRes.json("data");
+  if (!data || !data.uploadUrl || !data.assetId) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  // 2. upload bytes straight to object storage (public endpoint)
+  const uploadRes = http.put(data.uploadUrl, bytes, {
+    headers: { "Content-Type": contentType },
+    tags: { step: "upload" },
+  });
+  uploadLatency.add(uploadRes.timings.duration);
+  const uploadOk = check(uploadRes, {
+    "upload 2xx": (r) => r.status >= 200 && r.status < 300,
+  });
+  if (!uploadOk) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  // 3. complete -> enqueue processing
+  const completeRes = http.get(
+    `${BASE_URL}/api/v1/assets/${data.assetId}/complete`,
+    { headers: authHeaders(), tags: { step: "complete" } }
+  );
+  completeLatency.add(completeRes.timings.duration);
+  const completeOk = check(completeRes, {
+    "complete 2xx": (r) => r.status >= 200 && r.status < 300,
+  });
+  if (!completeOk) {
+    flowErrors.add(1);
+    return false;
+  }
+
+  flowErrors.add(0);
+  assetsSubmitted.add(1);
+  return true;
+}
+
+// Thresholds shared by both models, derived from the §4.2 SLOs.
+export const sloThresholds = {
+  // Presign SLO: p95 < 150ms.
+  mpiper_presign_latency_ms: ["p(95)<150"],
+  // End-to-end client errors must stay under 1% (job success SLO > 99%).
+  mpiper_flow_errors: ["rate<0.01"],
+  // Overall check pass rate.
+  checks: ["rate>0.99"],
+};
diff --git a/loadtest/open_model.js b/loadtest/open_model.js
new file mode 100644
index 0000000..ead2bfd
--- /dev/null
+++ b/loadtest/open_model.js
@@ -0,0 +1,35 @@
+// loadtest/open_model.js
+//
+// OPEN model: a fixed arrival rate of new uploads/sec, independent of how fast
+// the system responds. Good for finding the latency knee and watching queue
+// lag grow when arrival rate > service rate — a live demonstration of Little's
+// Law (L = λW). When the worker can't keep up, the Redis stream depth climbs
+// even though the API keeps accepting work.
+//
+//   RATE=5 DURATION=3m k6 run loadtest/open_model.js
+//
+// Prefer the wrapper: ./loadtest/run.sh open --rate 5/s --duration 3m
+
+import { runUploadFlow, sloThresholds } from "./lib.js";
+
+const RATE = parseInt(__ENV.RATE || "5", 10); // iterations/sec
+const DURATION = __ENV.DURATION || "3m";
+const MAX_VUS = parseInt(__ENV.MAX_VUS || String(RATE * 20), 10);
+
+export const options = {
+  scenarios: {
+    open: {
+      executor: "constant-arrival-rate",
+      rate: RATE,
+      timeUnit: "1s",
+      duration: DURATION,
+      preAllocatedVUs: Math.max(10, RATE * 2),
+      maxVUs: MAX_VUS,
+    },
+  },
+  thresholds: sloThresholds,
+};
+
+export default function () {
+  runUploadFlow();
+}
diff --git a/loadtest/run.sh b/loadtest/run.sh
new file mode 100755
index 0000000..d6e58b3
--- /dev/null
+++ b/loadtest/run.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# loadtest/run.sh — host-run wrapper for the MPiper k6 load harness.
+#
+# Usage:
+#   ./loadtest/run.sh closed --vus 10 --duration 2m [--ramp]
+#   ./loadtest/run.sh open   --rate 5/s --duration 3m [--max-vus 200]
+#
+# Options (any model):
+#   --fixture PATH   image fixture to fan out (default worker/tests/test_assets/image.jpg)
+#   --base-url URL   API base (default http://localhost:5010)
+#   --no-prometheus  do not stream k6 metrics to Prometheus remote-write
+#
+# Requires on the host: k6 (brew install k6), python3 with `cryptography`
+# (only to mint the AES-GCM auth token), and the stack up with the
+# observability overlay (so Prometheus remote-write is enabled).
+set -euo pipefail
+
+MODEL="${1:-}"
+if [[ "$MODEL" != "closed" && "$MODEL" != "open" ]]; then
+  echo "usage: $0 <closed|open> [options]" >&2
+  exit 2
+fi
+shift || true
+
+VUS=10
+DURATION=""
+RATE=5
+MAX_VUS=""
+RAMP=0
+FIXTURE="worker/tests/test_assets/image.jpg"
+BASE_URL="http://localhost:5010"
+USE_PROM=1
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --vus) VUS="$2"; shift 2 ;;
+    --duration) DURATION="$2"; shift 2 ;;
+    --rate) RATE="${2%/s}"; shift 2 ;;          # accept "5/s" or "5"
+    --max-vus) MAX_VUS="$2"; shift 2 ;;
+    --ramp) RAMP=1; shift ;;
+    --fixture) FIXTURE="$2"; shift 2 ;;
+    --base-url) BASE_URL="$2"; shift 2 ;;
+    --no-prometheus) USE_PROM=0; shift ;;
+    *) echo "unknown option: $1" >&2; exit 2 ;;
+  esac
+done
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# --- Mint an auth token (matches scripts/demo-e2e.sh / README) -----------
+ENCRYPTION_KEY="${ENCRYPTION_KEY:-0123456789abcdef0123456789abcdef}"
+K6_TOKEN="$(ENCRYPTION_KEY="$ENCRYPTION_KEY" python3 - <<'PY'
+import base64, os
+from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+key = os.environ["ENCRYPTION_KEY"].encode()
+nonce = os.urandom(12)
+ct = AESGCM(key).encrypt(nonce, b"demo-user", None)
+print(base64.urlsafe_b64encode(nonce + ct).rstrip(b"=").decode())
+PY
+)"
+export K6_TOKEN BASE_URL
+export FIXTURE_PATH="$REPO_ROOT/$FIXTURE"
+
+# --- Stream client metrics to the bundled Prometheus (remote-write) ------
+K6_OUT=()
+if [[ "$USE_PROM" == "1" ]]; then
+  export K6_PROMETHEUS_RW_SERVER_URL="${K6_PROMETHEUS_RW_SERVER_URL:-http://localhost:9090/api/v1/write}"
+  export K6_PROMETHEUS_RW_TREND_STATS="p(95),p(99),avg,max"
+  K6_OUT=(-o experimental-prometheus-rw)
+  echo "k6 → Prometheus remote-write at $K6_PROMETHEUS_RW_SERVER_URL"
+fi
+
+cd "$REPO_ROOT"
+
+if [[ "$MODEL" == "closed" ]]; then
+  export VUS DURATION="${DURATION:-2m}" RAMP
+  echo "closed model: VUS=$VUS DURATION=$DURATION RAMP=$RAMP"
+  exec k6 run "${K6_OUT[@]}" loadtest/closed_model.js
+else
+  export RATE DURATION="${DURATION:-3m}"
+  [[ -n "$MAX_VUS" ]] && export MAX_VUS
+  echo "open model: RATE=${RATE}/s DURATION=$DURATION"
+  exec k6 run "${K6_OUT[@]}" loadtest/open_model.js
+fi
diff --git a/observability/grafana/dashboards/dashboards.yaml b/observability/grafana/dashboards/dashboards.yaml
new file mode 100644
index 0000000..63f23a9
--- /dev/null
+++ b/observability/grafana/dashboards/dashboards.yaml
@@ -0,0 +1,24 @@
+# ============================================================================
+# Grafana Dashboards — provider config
+#
+# IMPORTANT: this file must live in the *dashboards* provisioning directory
+# (/etc/grafana/provisioning/dashboards), NOT the datasources one — Grafana
+# only scans this directory for dashboard providers. It was previously misplaced
+# under datasources/, so no JSON dashboards were ever loaded.
+# ============================================================================
+
+apiVersion: 1
+
+providers:
+  - name: 'MPiper'
+    orgId: 1
+    folder: 'MPiper'
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      # Same dir the JSON dashboards are mounted into; Grafana loads *.json as
+      # dashboards and ignores this provider yaml.
+      path: /etc/grafana/provisioning/dashboards
+      foldersFromFilesStructure: true
diff --git a/observability/grafana/dashboards/mpiper-api-red.json b/observability/grafana/dashboards/mpiper-api-red.json
new file mode 100644
index 0000000..7d90e26
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-api-red.json
@@ -0,0 +1,85 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-api-red",
+  "title": "MPiper — API RED",
+  "tags": ["mpiper", "track-3", "red"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "timeseries",
+      "title": "Request rate (req/s) by route",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (http_route) (rate(mpiper_http_server_request_count_total[5m]))",
+          "legendFormat": "{{http_route}}"
+        }
+      ]
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Error ratio (5xx) by route",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1, "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sli:http_error_ratio:ratio_rate5m",
+          "legendFormat": "{{http_route}}"
+        }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "Duration p95 by route (exemplars → Tempo)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sli:http_request_latency_seconds:p95",
+          "legendFormat": "{{http_route}} p95",
+          "exemplar": true
+        }
+      ]
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Presign p95 (SLO < 150ms)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 0.15 }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "targets": [
+        { "refId": "A", "expr": "sli:presign_latency_seconds:p95", "legendFormat": "presign p95" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-experiment-overview.json b/observability/grafana/dashboards/mpiper-experiment-overview.json
new file mode 100644
index 0000000..49d75ac
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-experiment-overview.json
@@ -0,0 +1,212 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "uid": "mpiper-experiment-overview",
+  "title": "MPiper — Experiment Overview (Track 3)",
+  "description": "One pane for load experiments: k6 client load alongside server-side API/worker/queue metrics and the SLO summary.",
+  "tags": ["mpiper", "track-3", "experiment", "load"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-1h", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    { "id": 100, "type": "row", "title": "Load — k6 client (host)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } },
+    {
+      "id": 1, "type": "stat", "title": "Active VUs", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "thresholds" } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(k6_vus)", "legendFormat": "VUs" } ]
+    },
+    {
+      "id": 2, "type": "timeseries", "title": "Throughput — iterations/s & assets submitted/s", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 12, "x": 6, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(k6_iterations_total[1m]))", "legendFormat": "iterations/s" },
+        { "refId": "B", "expr": "sum(rate(k6_mpiper_assets_submitted_total[1m]))", "legendFormat": "assets submitted/s" },
+        { "refId": "C", "expr": "sum(rate(k6_http_reqs_total[1m]))", "legendFormat": "http reqs/s" }
+      ]
+    },
+    {
+      "id": 3, "type": "stat", "title": "Client error rate (SLO < 1%)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 18, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.01 } ] } }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(k6_mpiper_flow_errors_rate)", "legendFormat": "flow errors" },
+        { "refId": "B", "expr": "max(k6_http_req_failed_rate)", "legendFormat": "http failed" }
+      ]
+    },
+    {
+      "id": 4, "type": "timeseries", "title": "Client step latency p95 (ms)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(k6_mpiper_presign_latency_ms_p95)", "legendFormat": "presign p95" },
+        { "refId": "B", "expr": "max(k6_mpiper_upload_latency_ms_p95)", "legendFormat": "upload p95" },
+        { "refId": "C", "expr": "max(k6_mpiper_complete_latency_ms_p95)", "legendFormat": "complete p95" }
+      ]
+    },
+    {
+      "id": 5, "type": "timeseries", "title": "Client HTTP duration p95 / p99 (ms)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(k6_http_req_duration_p95)", "legendFormat": "http_req_duration p95" },
+        { "refId": "B", "expr": "max(k6_http_req_duration_p99)", "legendFormat": "http_req_duration p99" }
+      ]
+    },
+
+    { "id": 101, "type": "row", "title": "API — server RED", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 } },
+    {
+      "id": 6, "type": "timeseries", "title": "Request rate by route (req/s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 8, "x": 0, "y": 15 },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sum by (http_route) (rate(mpiper_http_server_request_count_total[1m]))", "legendFormat": "{{http_route}}" } ]
+    },
+    {
+      "id": 7, "type": "timeseries", "title": "Duration p95 by route (exemplars → Tempo)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 8, "x": 8, "y": 15 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:http_request_latency_seconds:p95", "legendFormat": "{{http_route}}", "exemplar": true } ]
+    },
+    {
+      "id": 8, "type": "timeseries", "title": "In-flight requests & DB query rate", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 8, "x": 16, "y": 15 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(mpiper_http_server_active_requests)", "legendFormat": "in-flight" },
+        { "refId": "B", "expr": "sum(rate(mpiper_db_query_total[1m]))", "legendFormat": "db queries/s" }
+      ]
+    },
+
+    { "id": 102, "type": "row", "title": "Worker — saturation & service rate (μ)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } },
+    {
+      "id": 9, "type": "timeseries", "title": "Service rate μ: consumed vs completed (jobs/s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[2m]))", "legendFormat": "consumed/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_job_processing_success_total[2m]))", "legendFormat": "completed/s (μ)" }
+      ]
+    },
+    {
+      "id": 10, "type": "timeseries", "title": "Asset processing p95 by type (s) — exemplars", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "histogram_quantile(0.95, sum by (le, asset_type) (rate(mpiper_mpiper_asset_processing_duration_seconds_bucket[5m])))", "legendFormat": "{{asset_type}} p95", "exemplar": true } ]
+    },
+    {
+      "id": 11, "type": "timeseries", "title": "Asset processing mean (s) by type", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sum by (asset_type)(rate(mpiper_mpiper_asset_processing_duration_seconds_sum[5m])) / sum by (asset_type)(rate(mpiper_mpiper_asset_processing_duration_seconds_count[5m]))", "legendFormat": "{{asset_type}} mean" } ]
+    },
+    {
+      "id": 12, "type": "timeseries", "title": "Job processing p95 (s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_mpiper_job_processing_duration_seconds_bucket[5m])))", "legendFormat": "job p95" } ]
+    },
+
+    { "id": 103, "type": "row", "title": "Pipeline funnel & backlogs", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 } },
+    {
+      "id": 13, "type": "timeseries", "title": "Funnel: uploaded → ready / failed (per s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_asset_upload_total[2m]))", "legendFormat": "uploaded/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_asset_processing_success_total[2m]))", "legendFormat": "ready/s" },
+        { "refId": "C", "expr": "sum(rate(mpiper_mpiper_asset_processing_failed_total[2m]))", "legendFormat": "failed/s" }
+      ]
+    },
+    {
+      "id": 14, "type": "timeseries", "title": "Queue depth (Redis stream length)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 },
+      "fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 20 } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(mpiper_queue_depth)", "legendFormat": "queue depth" } ]
+    },
+    {
+      "id": 15, "type": "timeseries", "title": "Backlogs: outbox & webhook pending", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "max(mpiper_outbox_pending)", "legendFormat": "outbox pending" },
+        { "refId": "B", "expr": "max(mpiper_webhook_pending)", "legendFormat": "webhook pending" }
+      ]
+    },
+
+    { "id": 104, "type": "row", "title": "Queue / transport", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } },
+    {
+      "id": 16, "type": "timeseries", "title": "Publish vs consume (msg/s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 46 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_queue_message_published_total[2m]))", "legendFormat": "published/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[2m]))", "legendFormat": "consumed/s" }
+      ]
+    },
+    {
+      "id": 17, "type": "timeseries", "title": "Queue wait p95 & outbox relay lag p95 (s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 46 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:queue_wait_seconds:p95", "legendFormat": "queue wait p95 (publish-side)" },
+        { "refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_outbox_relay_lag_seconds_bucket[5m])))", "legendFormat": "outbox relay lag p95" }
+      ]
+    },
+
+    { "id": 105, "type": "row", "title": "SLO summary (vs targets)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 } },
+    {
+      "id": 18, "type": "stat", "title": "Presign p95 (< 150ms)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.15 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:presign_latency_seconds:p95" } ]
+    },
+    {
+      "id": 19, "type": "stat", "title": "Image ready p95 (< 5s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 5 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:image_ready_latency_seconds:p95" } ]
+    },
+    {
+      "id": 20, "type": "stat", "title": "Queue wait p95 (< 2s)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 12, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 2 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:queue_wait_seconds:p95" } ]
+    },
+    {
+      "id": 21, "type": "stat", "title": "Job success rate (> 99%)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 6, "w": 6, "x": 18, "y": 54 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:job_success_ratio:ratio_rate5m" } ]
+    },
+
+    { "id": 106, "type": "row", "title": "Database (connection pool & queries)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } },
+    {
+      "id": 22, "type": "timeseries", "title": "Connection pool (in-use / idle / open vs max)", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 0, "y": 61 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "mpiper_db_connections_active", "legendFormat": "in-use" },
+        { "refId": "B", "expr": "mpiper_db_connections_idle", "legendFormat": "idle" },
+        { "refId": "C", "expr": "mpiper_db_connections_open", "legendFormat": "open" },
+        { "refId": "D", "expr": "mpiper_db_connections_max_open", "legendFormat": "max open" }
+      ]
+    },
+    {
+      "id": 23, "type": "timeseries", "title": "DB query rate (q/s), p95 latency & pool waits", "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 12, "x": 12, "y": 61 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [
+        { "matcher": { "id": "byName", "options": "query p95" }, "properties": [ { "id": "unit", "value": "s" } ] }
+      ] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_db_query_total[2m]))", "legendFormat": "queries/s" },
+        { "refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_db_query_duration_seconds_bucket[5m])))", "legendFormat": "query p95" },
+        { "refId": "C", "expr": "rate(mpiper_db_connections_wait_count[5m])", "legendFormat": "pool waits/s" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-metrics.json b/observability/grafana/dashboards/mpiper-metrics.json
index e24f49f..edb9e6c 100644
--- a/observability/grafana/dashboards/mpiper-metrics.json
+++ b/observability/grafana/dashboards/mpiper-metrics.json
@@ -38,7 +38,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -87,7 +87,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_http_server_request_count_total[5m])",
           "refId": "A"
@@ -99,7 +99,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -152,9 +152,9 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "refId": "A"
         }
       ],
@@ -164,7 +164,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -238,7 +238,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "mpiper_http_server_active_requests",
           "refId": "A"
@@ -250,7 +250,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -299,9 +299,9 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"5..\"}[5m]) / rate(mpiper_http_server_request_count_total[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"5..\"}[5m]) / rate(mpiper_http_server_request_count_total[5m])",
           "refId": "A"
         }
       ],
@@ -324,7 +324,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -398,27 +398,27 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.50, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.50, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "legendFormat": "p50",
           "refId": "A"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.95, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.95, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "legendFormat": "p95",
           "refId": "B"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_bucket[5m]))",
+          "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))",
           "legendFormat": "p99",
           "refId": "C"
         }
@@ -429,7 +429,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -503,27 +503,27 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"2..\"}[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"2..\"}[5m])",
           "legendFormat": "2xx - {{method}} {{path}}",
           "refId": "A"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"4..\"}[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"4..\"}[5m])",
           "legendFormat": "4xx - {{method}} {{path}}",
           "refId": "B"
         },
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
-          "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"5..\"}[5m])",
+          "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"5..\"}[5m])",
           "legendFormat": "5xx - {{method}} {{path}}",
           "refId": "C"
         }
@@ -547,7 +547,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -621,7 +621,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_asset_upload_total[5m])",
           "legendFormat": "Uploads",
@@ -630,7 +630,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_asset_processing_success_total[5m])",
           "legendFormat": "Success",
@@ -639,7 +639,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_asset_processing_failed_total[5m])",
           "legendFormat": "Failed",
@@ -652,7 +652,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -726,7 +726,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_asset_upload_duration_bucket[5m]))",
           "legendFormat": "Upload p95",
@@ -735,7 +735,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_asset_processing_duration_bucket[5m]))",
           "legendFormat": "Processing p95",
@@ -748,7 +748,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -791,7 +791,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.50, rate(mpiper_asset_size_bucket[5m]))",
           "legendFormat": "p50",
@@ -800,7 +800,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_asset_size_bucket[5m]))",
           "legendFormat": "p95",
@@ -809,7 +809,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.99, rate(mpiper_asset_size_bucket[5m]))",
           "legendFormat": "p99",
@@ -835,7 +835,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -909,7 +909,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.50, rate(mpiper_db_query_duration_bucket[5m]))",
           "legendFormat": "p50",
@@ -918,7 +918,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.95, rate(mpiper_db_query_duration_bucket[5m]))",
           "legendFormat": "p95",
@@ -927,7 +927,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "histogram_quantile(0.99, rate(mpiper_db_query_duration_bucket[5m]))",
           "legendFormat": "p99",
@@ -940,7 +940,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -1014,7 +1014,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "mpiper_db_connections_active",
           "legendFormat": "Active",
@@ -1023,7 +1023,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "mpiper_db_connections_idle",
           "legendFormat": "Idle",
@@ -1036,7 +1036,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "prometheus"
       },
       "fieldConfig": {
         "defaults": {
@@ -1129,7 +1129,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_db_query_success_total[5m])",
           "legendFormat": "Success",
@@ -1138,7 +1138,7 @@
         {
           "datasource": {
             "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
+            "uid": "prometheus"
           },
           "expr": "rate(mpiper_db_query_failed_total[5m])",
           "legendFormat": "Errors",
diff --git a/observability/grafana/dashboards/mpiper-pipeline-funnel.json b/observability/grafana/dashboards/mpiper-pipeline-funnel.json
new file mode 100644
index 0000000..c2b254a
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-pipeline-funnel.json
@@ -0,0 +1,90 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-pipeline-funnel",
+  "title": "MPiper — Pipeline Funnel & Stage Latency",
+  "tags": ["mpiper", "track-3", "funnel"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "timeseries",
+      "title": "Funnel rate: uploaded → processed (success/failed)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_asset_upload_total[5m]))", "legendFormat": "uploaded/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_asset_processing_success_total[5m]))", "legendFormat": "ready/s" },
+        { "refId": "C", "expr": "sum(rate(mpiper_mpiper_asset_processing_failed_total[5m]))", "legendFormat": "failed/s" }
+      ]
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Image ready p95 (SLO < 5s)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 5 } ] }
+        },
+        "overrides": []
+      },
+      "targets": [ { "refId": "A", "expr": "sli:image_ready_latency_seconds:p95", "legendFormat": "image p95" } ]
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Video ready p95 (SLO < 60s)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 60 } ] }
+        },
+        "overrides": []
+      },
+      "targets": [ { "refId": "A", "expr": "sli:video_ready_latency_seconds:p95", "legendFormat": "video p95" } ]
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Job success rate (SLO > 99%)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit", "min": 0, "max": 1,
+          "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] }
+        },
+        "overrides": []
+      },
+      "targets": [ { "refId": "A", "expr": "sli:job_success_ratio:ratio_rate5m", "legendFormat": "success rate" } ]
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Worker asset processing p95 by type (exemplars → Tempo)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le, asset_type) (rate(mpiper_mpiper_asset_processing_duration_seconds_bucket[5m])))",
+          "legendFormat": "{{asset_type}} p95",
+          "exemplar": true
+        }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-queue-health.json b/observability/grafana/dashboards/mpiper-queue-health.json
new file mode 100644
index 0000000..2523550
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-queue-health.json
@@ -0,0 +1,58 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-queue-health",
+  "title": "MPiper — Queue Health",
+  "tags": ["mpiper", "track-3", "queue"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "timeseries",
+      "title": "Stream length (queue depth)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "max(mpiper_queue_depth)", "legendFormat": "queue depth" } ]
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Publish vs consume rate (msg/s)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_queue_message_published_total[5m]))", "legendFormat": "published/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[5m]))", "legendFormat": "consumed/s" }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "Queue wait p95 (publish-side proxy; authoritative in Tempo)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [ { "refId": "A", "expr": "sli:queue_wait_seconds:p95", "legendFormat": "queue wait p95" } ]
+    },
+    {
+      "id": 4,
+      "type": "timeseries",
+      "title": "Webhook delivery p95 (SLO < 10s) + pending",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:webhook_delivery_latency_seconds:p95", "legendFormat": "delivery p95" },
+        { "refId": "B", "expr": "sli:webhook_pending:current", "legendFormat": "pending (count)" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/dashboards/mpiper-worker-use.json b/observability/grafana/dashboards/mpiper-worker-use.json
new file mode 100644
index 0000000..0ab7fd2
--- /dev/null
+++ b/observability/grafana/dashboards/mpiper-worker-use.json
@@ -0,0 +1,89 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 0,
+  "id": null,
+  "uid": "mpiper-worker-use",
+  "title": "MPiper — Worker / App Saturation (USE)",
+  "tags": ["mpiper", "track-3", "use"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "time": { "from": "now-30m", "to": "now" },
+  "refresh": "10s",
+  "templating": { "list": [] },
+  "panels": [
+    {
+      "id": 1,
+      "type": "row",
+      "title": "Saturation (app-level: no per-container CPU by design)",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Queue depth (Redis stream length)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:queue_depth:current", "legendFormat": "queue depth" }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "In-flight HTTP requests",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(mpiper_http_server_active_requests)", "legendFormat": "active requests" }
+      ]
+    },
+    {
+      "id": 4,
+      "type": "timeseries",
+      "title": "Outbox relay lag p95 (oldest pending age)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_outbox_relay_lag_seconds_bucket[5m])))",
+          "legendFormat": "relay lag p95"
+        }
+      ]
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Backlogs: outbox + webhook pending",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sli:outbox_pending:current", "legendFormat": "outbox pending" },
+        { "refId": "B", "expr": "sli:webhook_pending:current", "legendFormat": "webhook pending" }
+      ]
+    },
+    {
+      "id": 6,
+      "type": "row",
+      "title": "Errors",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }
+    },
+    {
+      "id": 7,
+      "type": "timeseries",
+      "title": "Queue message failures + outbox publish failures (rate)",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(mpiper_queue_message_failed_total[5m]))", "legendFormat": "queue failures/s" },
+        { "refId": "B", "expr": "sum(rate(mpiper_outbox_publish_failures_total[5m]))", "legendFormat": "outbox publish failures/s" }
+      ]
+    }
+  ]
+}
diff --git a/observability/grafana/datasources/dashboards.yml b/observability/grafana/datasources/dashboards.yml
deleted file mode 100644
index 3e5981b..0000000
--- a/observability/grafana/datasources/dashboards.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-# ============================================================================
-# Grafana Dashboards - Auto-provisioning
-# ============================================================================
-
-apiVersion: 1
-
-providers:
-  - name: 'Default'
-    orgId: 1
-    folder: 'MPiper'
-    type: file
-    disableDeletion: false
-    updateIntervalSeconds: 10
-    allowUiUpdates: true
-    options:
-      path: /etc/grafana/provisioning/dashboards
-      foldersFromFilesStructure: true
diff --git a/observability/grafana/datasources/datasources.yml b/observability/grafana/datasources/datasources.yml
index 1d35862..a8514be 100644
--- a/observability/grafana/datasources/datasources.yml
+++ b/observability/grafana/datasources/datasources.yml
@@ -57,8 +57,11 @@ datasources:
     url: http://loki:3100
     jsonData:
       derivedFields:
+        # Match trace_id across formats: worker (trace_id=abc), API JSON
+        # ("trace_id":"abc"), and API dev pretty (trace_id: abc). \W+ swallows
+        # the separator (=, ":", ": ") between the key and the hex value.
         - datasourceUid: tempo
-          matcherRegex: "trace_id=(\\w+)"
+          matcherRegex: "trace_id\\W+(\\w+)"
           name: TraceID
           url: '$${__value.raw}'
     editable: true
diff --git a/observability/prometheus.rules.yml b/observability/prometheus.rules.yml
new file mode 100644
index 0000000..005f54a
--- /dev/null
+++ b/observability/prometheus.rules.yml
@@ -0,0 +1,126 @@
+# ============================================================================
+# MPiper — SLO recording rules (Track 3, Phase 3)
+#
+# Encodes the SLIs from track-03-observability-and-load.md §4.2 as recording
+# rules so each renders directly on a dashboard against its target.
+#
+# Metric naming convention
+# -------------------------
+# App telemetry is OTLP → the collector's Prometheus exporter, which is
+# configured with `namespace: mpiper` (see observability/otel-collector.yml).
+# OTel instrument names have dots replaced by underscores and the unit appended:
+#
+#   Go API   http.server.request.duration (s)  -> mpiper_http_server_request_duration_seconds_{bucket,sum,count}
+#   Go API   asset.processing.success (counter) -> mpiper_asset_processing_success_total
+#   Worker   mpiper.job.processing.success      -> mpiper_mpiper_job_processing_success_total
+#            (worker instruments already carry a `mpiper.` prefix, so the
+#             collector namespace yields a double `mpiper_mpiper_` prefix)
+#
+# If your collector build sets `add_metric_suffixes: false` or a different
+# namespace, adjust the series names below. Validate with:
+#   promtool check rules observability/prometheus.rules.yml
+# and against live series in Prometheus → Status → Targets after a load run.
+# ============================================================================
+
+groups:
+  - name: mpiper_slo
+    interval: 15s
+    rules:
+      # --- Presign latency: p95 of POST /api/v1/storage/presign  (target < 150ms)
+      - record: sli:presign_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_http_server_request_duration_seconds_bucket{http_route="/api/v1/storage/presign"}[5m])
+            )
+          )
+
+      # --- API overall request latency p95 per route (RED "Duration")
+      - record: sli:http_request_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le, http_route) (
+              rate(mpiper_http_server_request_duration_seconds_bucket[5m])
+            )
+          )
+
+      # --- API error ratio per route (RED "Errors"): 5xx / all
+      - record: sli:http_error_ratio:ratio_rate5m
+        expr: >
+          sum by (http_route) (
+            rate(mpiper_http_server_request_count_total{http_status_code=~"5.."}[5m])
+          )
+          /
+          clamp_min(
+            sum by (http_route) (rate(mpiper_http_server_request_count_total[5m])),
+            1
+          )
+
+      # --- Image ready latency: p95 of worker asset processing for images.
+      # Server-side processing proxy for complete -> ready; queue wait is added
+      # on top and is best read from the trace waterfall (enqueue -> consume).
+      - record: sli:image_ready_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_mpiper_asset_processing_duration_seconds_bucket{asset_type="image"}[5m])
+            )
+          )
+
+      # --- Video ready latency: p95 of worker asset processing for videos  (target < 60s)
+      - record: sli:video_ready_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_mpiper_asset_processing_duration_seconds_bucket{asset_type="video"}[5m])
+            )
+          )
+
+      # --- Queue wait: p95 of the producer-side processing lag histogram.
+      # The authoritative enqueue -> consume gap is visible per-asset in Tempo;
+      # this metric approximates the publish-side contribution.
+      - record: sli:queue_wait_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_queue_processing_lag_seconds_bucket[5m])
+            )
+          )
+
+      # --- Job success rate: done / (done + failed)  (target > 99%)
+      # `or vector(0)` guards the empty-vector pitfall: with zero failures the
+      # failed series does not exist, and a bare A + B would yield no result.
+      - record: sli:job_success_ratio:ratio_rate5m
+        expr: >
+          (sum(rate(mpiper_mpiper_job_processing_success_total[5m])) or vector(0))
+          /
+          clamp_min(
+            (sum(rate(mpiper_mpiper_job_processing_success_total[5m])) or vector(0))
+            + (sum(rate(mpiper_mpiper_job_processing_failed_total[5m])) or vector(0)),
+            1
+          )
+
+      # --- Webhook delivery latency: p95 of delivery HTTP calls  (target < 10s)
+      - record: sli:webhook_delivery_latency_seconds:p95
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(mpiper_webhook_delivery_duration_seconds_bucket[5m])
+            )
+          )
+
+      # --- Queue health snapshots (gauges already exported by the app)
+      - record: sli:queue_depth:current
+        expr: max(mpiper_queue_depth)
+
+      - record: sli:outbox_pending:current
+        expr: max(mpiper_outbox_pending)
+
+      - record: sli:webhook_pending:current
+        expr: max(mpiper_webhook_pending)
diff --git a/observability/prometheus.yml b/observability/prometheus.yml
index a628c78..bf75a95 100644
--- a/observability/prometheus.yml
+++ b/observability/prometheus.yml
@@ -9,6 +9,11 @@ global:
     cluster: 'mpiper-local'
     environment: 'development'
 
+# SLO recording rules (Track 3, Phase 3). Mounted into the container at
+# /etc/prometheus/ alongside this file by docker-compose.observability.yml.
+rule_files:
+  - /etc/prometheus/prometheus.rules.yml
+
 # Alertmanager configuration (optional)
 # alerting:
 #   alertmanagers:
diff --git a/worker/consumer/consumer.py b/worker/consumer/consumer.py
index c66c990..375daa9 100644
--- a/worker/consumer/consumer.py
+++ b/worker/consumer/consumer.py
@@ -28,16 +28,21 @@
 from __future__ import annotations
 
 import time
+from contextlib import nullcontext
 from typing import Dict
 
 import redis
 from redis.exceptions import ResponseError
+from opentelemetry import trace
+from opentelemetry.propagate import extract
 
 from worker.consumer.config import WorkerConfig
 from worker.consumer.db import PgPool
 from worker.processing.processor import RetryableException, process_asset_dispatch
 from worker.storage.base import StorageX
 from worker.utils.logger import get_logger
+from worker.utils.tracing import get_tracer
+from worker.utils import metrics as wm
 from worker.webhooks import insert_webhook_deliveries
 
 logger = get_logger(__name__)
@@ -159,6 +164,7 @@ def consume(self, consumer_name: str) -> bool:
         # Response format: [(stream_name, [(msg_id, {field: value}), ...])]
         _, messages = resp[0]
         msg_id, fields = messages[0]
+        wm.record_consume()
 
         try:
             # Normalize fields to a dict
@@ -180,14 +186,25 @@ def consume(self, consumer_name: str) -> bool:
             job_id = payload.get("job_id")
             asset_id = payload.get("asset_id")
 
-            if job_id:
-                self._handle_job(job_id, msg_id)
-            elif asset_id:
-                self._handle_asset_message(asset_id, msg_id)
-            else:
-                logger.error("message missing job_id and asset_id: %s", payload)
-                # Acknowledge to remove the malformed message from the stream.
-                self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id)
+            # Extract the producer trace context (injected by the Go relay) and
+            # continue the trace here. traceparent may be a top-level stream
+            # field or have been merged in from `body` above. We start the
+            # consume span as a CHILD of the producer context (keeps the Tempo
+            # waterfall readable) AND attach a link to it — a link is the correct
+            # primitive for queue fan-in, where one consumer may join many
+            # producers. The gap between the producer's enqueue span and this
+            # span is the queue wait time.
+            with self._consume_span(payload, msg_id, job_id, asset_id):
+                if job_id:
+                    self._handle_job(job_id, msg_id)
+                elif asset_id:
+                    self._handle_asset_message(asset_id, msg_id)
+                else:
+                    logger.error("message missing job_id and asset_id: %s", payload)
+                    # Acknowledge to remove the malformed message from the stream.
+                    self.redis.xack(
+                        self.cfg.stream_name, self.cfg.consumer_group, msg_id
+                    )
         except Exception:
             logger.exception("unhandled exception while processing message %s", msg_id)
             # Do not ack the message so it remains in the pending entries list
@@ -195,6 +212,43 @@ def consume(self, consumer_name: str) -> bool:
 
         return True
 
+    def _consume_span(self, payload, msg_id, job_id, asset_id):
+        """Start the worker.consume span continuing the producer trace.
+
+        Returns a context manager. When tracing is not initialised (telemetry
+        failed at startup) this is a no-op so message processing is unaffected.
+        """
+        tracer = get_tracer()
+        if tracer is None:
+            return nullcontext()
+
+        carrier = {
+            k: payload[k]
+            for k in ("traceparent", "tracestate", "baggage")
+            if k in payload
+        }
+        parent_ctx = extract(carrier)
+        producer_sc = trace.get_current_span(parent_ctx).get_span_context()
+        links = [trace.Link(producer_sc)] if producer_sc.is_valid else None
+
+        attrs = {
+            "messaging.system": "redis",
+            "messaging.destination.name": self.cfg.stream_name,
+            "messaging.message.id": msg_id,
+        }
+        if job_id:
+            attrs["job_id"] = str(job_id)
+        if asset_id:
+            attrs["asset_id"] = str(asset_id)
+
+        return tracer.start_as_current_span(
+            "worker.consume",
+            context=parent_ctx,
+            kind=trace.SpanKind.CONSUMER,
+            links=links,
+            attributes=attrs,
+        )
+
     def _handle_job(self, job_id: int, msg_id: str) -> None:
         """Load the job row, mark it in-progress, run processing, and finalize.
 
@@ -233,10 +287,12 @@ def _handle_job(self, job_id: int, msg_id: str) -> None:
             conn.commit()
 
         # Run the processing outside the DB transaction.
+        job_start = time.time()
         try:
             process_asset_dispatch(asset_id, self.pg, self.storage, self.cfg)
         except Exception as exc:
             logger.exception("processing failed for job=%s asset=%s", job_id, asset_id)
+            wm.record_job(success=False, duration_seconds=time.time() - job_start)
 
             with self.pg.get_pg_conn() as conn:
                 cur = conn.cursor()
@@ -285,6 +341,8 @@ def _handle_job(self, job_id: int, msg_id: str) -> None:
             insert_webhook_deliveries(cur, asset_id, job_id, "job.done")
             conn.commit()
 
+        wm.record_job(success=True, duration_seconds=time.time() - job_start)
+
         # Acknowledge the Redis stream message.
         self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id)
 
diff --git a/worker/consumer/main.py b/worker/consumer/main.py
index 673298c..c94bfff 100644
--- a/worker/consumer/main.py
+++ b/worker/consumer/main.py
@@ -10,6 +10,7 @@
 from worker.consumer.migrations import run_migrations
 from worker.storage import get_storage
 from worker.utils import metrics as worker_metrics
+from worker.utils import tracing as worker_tracing
 
 logger = logging.getLogger(__name__)
 
@@ -19,6 +20,33 @@ def main():
     logger.info("Starting worker consumer...")
 
     cfg = get_config()
+
+    # Initialise telemetry before anything else so startup is observable.
+    # NOTE: init_metrics() was previously never called — worker OTel metrics
+    # were defined but never wired up. We initialise both tracing and metrics
+    # here from the same OtelConfig so they share endpoint/resource/lifecycle.
+    otel = cfg.otel
+    try:
+        worker_tracing.init_tracing(
+            service_name=otel.service_name,
+            service_version=otel.service_version,
+            endpoint=otel.endpoint,
+            deployment_env=otel.deployment_env,
+            instance_id=otel.instance_id,
+            tls_insecure=otel.tls_insecure,
+        )
+        worker_metrics.init_metrics(
+            service_name=otel.service_name,
+            service_version=otel.service_version,
+            endpoint=otel.endpoint,
+            deployment_env=otel.deployment_env,
+            instance_id=otel.instance_id,
+            tls_insecure=otel.tls_insecure,
+        )
+    except Exception:
+        # Telemetry must never prevent the worker from processing jobs.
+        logger.exception("failed to initialise telemetry; continuing without it")
+
     storage = get_storage(cfg)
     password = quote_plus(cfg.database.password)
 
@@ -60,8 +88,9 @@ def _term(signum, frame):
             time.sleep(1)
 
     logger.info("exiting")
-    
-    # Shutdown metrics on exit
+
+    # Shutdown telemetry on exit (flush pending spans + metrics).
+    worker_tracing.shutdown_tracing()
     worker_metrics.shutdown_metrics()
 
 
diff --git a/worker/processing/images.py b/worker/processing/images.py
index e4c03f5..83171ed 100644
--- a/worker/processing/images.py
+++ b/worker/processing/images.py
@@ -3,7 +3,10 @@
 import logging
 import os
 
+from opentelemetry import trace
+
 logger = logging.getLogger("images")
+tracer = trace.get_tracer("worker.processing.images")
 
 IMAGE_VARIANTS = [
     {
@@ -68,40 +71,48 @@ def process_image_file(
             role = v["role"]
             logger.info("generating image variant %s for asset %s", role, asset_id)
 
-            if v["crop"]:
-                out_img = ImageOps.fit(
-                    img,
-                    (v["width"], v["height"]),
-                    Image.LANCZOS,
-                    centering=(0.5, 0.5),
-                )
-            else:
-                target_w = v["width"] or src_width
-                ratio = target_w / float(src_width)
-                target_h = int(src_height * ratio)
-                out_img = img.resize((target_w, target_h), Image.LANCZOS)
-
-            data = encode_image(out_img, v["format"], v["quality"])
-
-            key = f"media/processed/{asset_id}/{role}.{v['format']}"
-            storage.upload_bytes(key, data, content_type=f"image/{v['format']}")
-            url = storage.public_url(key)
-
-            # Upsert into variants.image (PK is asset_id + role)
-            with pg_pool.get_pg_conn() as conn:
-                conn.cursor().execute(
-                    """
-                    INSERT INTO variants.image (asset_id, url, role, width, height, size_bytes, format)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s)
-                    ON CONFLICT (asset_id, role) DO UPDATE SET
-                        url = EXCLUDED.url,
-                        width = EXCLUDED.width,
-                        height = EXCLUDED.height,
-                        size_bytes = EXCLUDED.size_bytes,
-                        format = EXCLUDED.format
-                    """,
-                    (asset_id, url, role, out_img.width, out_img.height, len(data), v["format"]),
-                )
+            with tracer.start_as_current_span("image.variant") as span:
+                span.set_attribute("asset_id", asset_id)
+                span.set_attribute("variant.role", role)
+                span.set_attribute("variant.format", v["format"])
+
+                if v["crop"]:
+                    out_img = ImageOps.fit(
+                        img,
+                        (v["width"], v["height"]),
+                        Image.LANCZOS,
+                        centering=(0.5, 0.5),
+                    )
+                else:
+                    target_w = v["width"] or src_width
+                    ratio = target_w / float(src_width)
+                    target_h = int(src_height * ratio)
+                    out_img = img.resize((target_w, target_h), Image.LANCZOS)
+
+                data = encode_image(out_img, v["format"], v["quality"])
+                span.set_attribute("variant.width", out_img.width)
+                span.set_attribute("variant.height", out_img.height)
+                span.set_attribute("variant.size_bytes", len(data))
+
+                key = f"media/processed/{asset_id}/{role}.{v['format']}"
+                storage.upload_bytes(key, data, content_type=f"image/{v['format']}")
+                url = storage.public_url(key)
+
+                # Upsert into variants.image (PK is asset_id + role)
+                with pg_pool.get_pg_conn() as conn:
+                    conn.cursor().execute(
+                        """
+                        INSERT INTO variants.image (asset_id, url, role, width, height, size_bytes, format)
+                        VALUES (%s, %s, %s, %s, %s, %s, %s)
+                        ON CONFLICT (asset_id, role) DO UPDATE SET
+                            url = EXCLUDED.url,
+                            width = EXCLUDED.width,
+                            height = EXCLUDED.height,
+                            size_bytes = EXCLUDED.size_bytes,
+                            format = EXCLUDED.format
+                        """,
+                        (asset_id, url, role, out_img.width, out_img.height, len(data), v["format"]),
+                    )
 
     # Mark asset ready
     with pg_pool.get_pg_conn() as conn:
diff --git a/worker/processing/processor.py b/worker/processing/processor.py
index 9e6244f..5166fce 100644
--- a/worker/processing/processor.py
+++ b/worker/processing/processor.py
@@ -1,6 +1,9 @@
 import os
+import time
 from enum import Enum
 
+from opentelemetry import trace
+
 from worker.consumer.config import WorkerConfig
 from worker.consumer.db import PgPool
 from worker.processing.images import process_image_file
@@ -8,8 +11,10 @@
 from worker.storage.base import StorageX
 from worker.utils.hash import compute_file_hash
 from worker.utils.logger import get_logger
+from worker.utils import metrics as wm
 
 logger = get_logger(__name__)
+tracer = trace.get_tracer("worker.processing")
 
 
 class AssetStatus(Enum):
@@ -133,88 +138,119 @@ def process_asset_dispatch(
     """
     Main entry point for asset processing with deduplication.
     """
-    # 1. Load asset metadata
-    with pg_pool.get_pg_conn() as conn:
-        cur = conn.cursor()
-        cur.execute(
-            """
-            SELECT asset_id, type, status, original_url, mime_type, content_hash
-            FROM assets
-            WHERE asset_id = %s
-            """,
-            (asset_id,),
-        )
-        row = cur.fetchone()
-        if not row:
-            raise RuntimeError(f"Asset not found: {asset_id}")
-
-        _, typ, status, original_url, mime_type, content_hash = row
-
-    # 2. Early exit if already processed
-    if status in (AssetStatus.READY.value, AssetStatus.DUPLICATE.value):
-        logger.info("Asset %s already in final state: %s", asset_id, status)
-        return
+    with tracer.start_as_current_span("process.dispatch") as span:
+        # asset_id is fine as a span attribute (high cardinality is OK on traces),
+        # but must NEVER become a metric label.
+        span.set_attribute("asset_id", asset_id)
 
-    # 3. Proceed with processing
-    local_raw_file = None
-    try:
-        # Mark as processing
+        # 1. Load asset metadata
         with pg_pool.get_pg_conn() as conn:
             cur = conn.cursor()
             cur.execute(
-                "UPDATE assets SET status = %s WHERE asset_id = %s",
-                (AssetStatus.PROCESSING.value, asset_id)
+                """
+                SELECT asset_id, type, status, original_url, mime_type, content_hash
+                FROM assets
+                WHERE asset_id = %s
+                """,
+                (asset_id,),
             )
-
-        # Download raw file
-        raw_key = f"media/raw/{asset_id}"
-        tmp_dir = cfg.temp_dir
-        os.makedirs(tmp_dir, exist_ok=True)
-        local_raw_file = os.path.join(
-            tmp_dir, f"{asset_id}-raw.{get_extension_for_mime(mime_type)}"
-        )
-        storage.download_to_file(raw_key, local_raw_file)
-
-        content_hash = compute_file_hash(local_raw_file)
-
-        # Check for duplicate using the actual downloaded file's hash
-        if content_hash:
-            dedup_result = check_for_duplicate(content_hash, asset_id, pg_pool)
-
-            if dedup_result == DedupResult.DUPLICATE_READY:
-                logger.info("Asset %s deduplicated successfully", asset_id)
-                return
-            elif dedup_result == DedupResult.DUPLICATE_PENDING:
-                raise RetryableException(
-                    f"Canonical asset for {asset_id} not ready yet"
+            row = cur.fetchone()
+            if not row:
+                raise RuntimeError(f"Asset not found: {asset_id}")
+
+            _, typ, status, original_url, mime_type, content_hash = row
+
+        span.set_attribute("asset.type", typ or "unknown")
+        span.set_attribute("asset.status", status or "unknown")
+
+        # 2. Early exit if already processed
+        if status in (AssetStatus.READY.value, AssetStatus.DUPLICATE.value):
+            logger.info("Asset %s already in final state: %s", asset_id, status)
+            span.set_attribute("dispatch.short_circuit", "already_final")
+            return
+
+        # 3. Proceed with processing
+        local_raw_file = None
+        proc_start = time.time()
+        try:
+            # Mark as processing
+            with pg_pool.get_pg_conn() as conn:
+                cur = conn.cursor()
+                cur.execute(
+                    "UPDATE assets SET status = %s WHERE asset_id = %s",
+                    (AssetStatus.PROCESSING.value, asset_id)
                 )
 
-        # Process based on type
-        if typ == "image":
-            process_image_file(
-                asset_id, local_raw_file, content_hash, pg_pool, storage, cfg
-            )
-        elif typ == "video":
-            process_video_file(
-                asset_id, local_raw_file, content_hash, pg_pool, storage, cfg
+            # Download raw file
+            raw_key = f"media/raw/{asset_id}"
+            tmp_dir = cfg.temp_dir
+            os.makedirs(tmp_dir, exist_ok=True)
+            local_raw_file = os.path.join(
+                tmp_dir, f"{asset_id}-raw.{get_extension_for_mime(mime_type)}"
             )
-        else:
-            raise ValueError(f"Unknown asset type: {typ}")
-
-    except Exception as e:
-        # Do not touch assets.status here. The consumer (_handle_job) owns the
-        # asset state transition: it marks the asset failed only after the retry
-        # cap is hit, and ready on success. Writing 'failed' on every exception
-        # — including RetryableException — left the asset stuck failed across
-        # retries even though the job was still pending. See DEV-34.
-        logger.error("Failed to process asset %s: %s", asset_id, e, exc_info=True)
-        raise
-    finally:
-        if local_raw_file and os.path.exists(local_raw_file):
-            try:
-                os.unlink(local_raw_file)
-            except OSError:
-                logger.warning("Failed to delete temp file %s", local_raw_file)
+            with tracer.start_as_current_span("process.download") as dl_span:
+                dl_span.set_attribute("asset_id", asset_id)
+                dl_span.set_attribute("storage.key", raw_key)
+                storage.download_to_file(raw_key, local_raw_file)
+                try:
+                    dl_span.set_attribute(
+                        "download.size_bytes", os.path.getsize(local_raw_file)
+                    )
+                except OSError:
+                    pass
+
+            content_hash = compute_file_hash(local_raw_file)
+
+            # Check for duplicate using the actual downloaded file's hash
+            if content_hash:
+                with tracer.start_as_current_span("process.dedup_check") as dd_span:
+                    dd_span.set_attribute("asset_id", asset_id)
+                    dd_span.set_attribute("content_hash", content_hash)
+                    dedup_result = check_for_duplicate(content_hash, asset_id, pg_pool)
+                    dd_span.set_attribute("dedup.result", dedup_result.value)
+
+                if dedup_result == DedupResult.DUPLICATE_READY:
+                    logger.info("Asset %s deduplicated successfully", asset_id)
+                    span.set_attribute("dispatch.short_circuit", "deduplicated")
+                    return
+                elif dedup_result == DedupResult.DUPLICATE_PENDING:
+                    raise RetryableException(
+                        f"Canonical asset for {asset_id} not ready yet"
+                    )
+
+            # Process based on type
+            proc_start = time.time()
+            if typ == "image":
+                process_image_file(
+                    asset_id, local_raw_file, content_hash, pg_pool, storage, cfg
+                )
+            elif typ == "video":
+                process_video_file(
+                    asset_id, local_raw_file, content_hash, pg_pool, storage, cfg
+                )
+            else:
+                raise ValueError(f"Unknown asset type: {typ}")
+
+            # Asset processing duration, labelled by type only (low cardinality).
+            # Feeds the image/video "ready latency" SLIs. asset_id stays a span
+            # attribute, never a metric label.
+            wm.record_asset(typ, time.time() - proc_start, success=True)
+
+        except Exception as e:
+            # Do not touch assets.status here. The consumer (_handle_job) owns the
+            # asset state transition: it marks the asset failed only after the retry
+            # cap is hit, and ready on success. Writing 'failed' on every exception
+            # — including RetryableException — left the asset stuck failed across
+            # retries even though the job was still pending. See DEV-34.
+            logger.error("Failed to process asset %s: %s", asset_id, e, exc_info=True)
+            wm.record_asset(typ, time.time() - proc_start, success=False)
+            raise
+        finally:
+            if local_raw_file and os.path.exists(local_raw_file):
+                try:
+                    os.unlink(local_raw_file)
+                except OSError:
+                    logger.warning("Failed to delete temp file %s", local_raw_file)
 
 
 def clone_image_variants(cur, canonical_asset_id: str, new_asset_id: str) -> int:
diff --git a/worker/processing/videos.py b/worker/processing/videos.py
index 4432e6a..30eabe0 100644
--- a/worker/processing/videos.py
+++ b/worker/processing/videos.py
@@ -5,15 +5,25 @@
 import subprocess
 import tempfile
 
+from opentelemetry import trace
+
 logger = logging.getLogger("videos")
+tracer = trace.get_tracer("worker.processing.videos")
 
 
 def run(cmd: list[str]) -> None:
     """Execute ffmpeg command with error handling."""
     logger.info("ffmpeg: %s", " ".join(cmd))
-    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if res.returncode != 0:
-        raise RuntimeError(f"FFmpeg failed: {res.stderr.decode()}")
+    with tracer.start_as_current_span("ffmpeg.exec") as span:
+        # cmd[0] is the binary (ffmpeg/ffprobe); record it without the full
+        # argv to avoid leaking paths as high-cardinality span attributes.
+        span.set_attribute("ffmpeg.binary", cmd[0] if cmd else "")
+        res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        span.set_attribute("ffmpeg.returncode", res.returncode)
+        if res.returncode != 0:
+            err = res.stderr.decode()
+            span.set_status(trace.StatusCode.ERROR, "ffmpeg failed")
+            raise RuntimeError(f"FFmpeg failed: {err}")
 
 
 def probe_video(local_path: str) -> dict:
@@ -152,9 +162,14 @@ def process_video_file(asset_id, local_raw_path, content_hash, pg_pool, storage,
             (content_hash, asset_id),
         )
 
-    generate_poster(asset_id, local_raw_path, storage, cfg, pg_pool)
-    transcode_720p(asset_id, local_raw_path, storage, cfg, pg_pool)
-    generate_preview(asset_id, local_raw_path, storage, cfg, pg_pool)
+    for stage_name, fn in (
+        ("video.poster", generate_poster),
+        ("video.transcode_720p", transcode_720p),
+        ("video.preview", generate_preview),
+    ):
+        with tracer.start_as_current_span(stage_name) as span:
+            span.set_attribute("asset_id", asset_id)
+            fn(asset_id, local_raw_path, storage, cfg, pg_pool)
 
     # Mark asset ready
     with pg_pool.get_pg_conn() as conn:
diff --git a/worker/tests/test_consumer_tracing.py b/worker/tests/test_consumer_tracing.py
new file mode 100644
index 0000000..4e6eca7
--- /dev/null
+++ b/worker/tests/test_consumer_tracing.py
@@ -0,0 +1,109 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from opentelemetry.propagate import set_global_textmap
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry.trace.propagation.tracecontext import (
+    TraceContextTextMapPropagator,
+)
+
+from worker.consumer.consumer import Consumer
+
+
+TRACE_ID_HEX = "0af7651916cd43dd8448eb211c80319c"
+SPAN_ID_HEX = "b7ad6b7169203331"
+TRACEPARENT = f"00-{TRACE_ID_HEX}-{SPAN_ID_HEX}-01"
+
+
+def _make_consumer():
+    cfg = MagicMock()
+    cfg.stream_name = "media:jobs"
+    cfg.consumer_group = "media-workers"
+    with patch("worker.consumer.consumer.redis.Redis.from_url") as from_url:
+        from_url.return_value = MagicMock()
+        consumer = Consumer(
+            pg_pool=MagicMock(), redis_url="redis://x", storage=MagicMock(), cfg=cfg
+        )
+    return consumer, from_url.return_value
+
+
+class TestConsumeSpanPropagation(unittest.TestCase):
+    """Phase 1d: the worker continues the producer trace across the queue."""
+
+    def setUp(self):
+        set_global_textmap(TraceContextTextMapPropagator())
+        self.exporter = InMemorySpanExporter()
+        provider = TracerProvider()
+        provider.add_span_processor(SimpleSpanProcessor(self.exporter))
+        self.tracer = provider.get_tracer("test")
+
+    def _run_consume_with(self, fields):
+        consumer, client = _make_consumer()
+        client.xreadgroup.return_value = [("media:jobs", [("1-0", fields)])]
+        consumer._recover_stuck_pending = MagicMock()
+        consumer._handle_job = MagicMock()
+        consumer._handle_asset_message = MagicMock()
+        with patch(
+            "worker.consumer.consumer.get_tracer", return_value=self.tracer
+        ):
+            consumer.consume("worker-1")
+        return consumer
+
+    def test_consume_span_is_child_and_linked_to_producer(self):
+        consumer = self._run_consume_with({"job_id": "42", "traceparent": TRACEPARENT})
+        consumer._handle_job.assert_called_once_with("42", "1-0")
+
+        spans = self.exporter.get_finished_spans()
+        consume = next(s for s in spans if s.name == "worker.consume")
+
+        expected_trace_id = int(TRACE_ID_HEX, 16)
+        expected_span_id = int(SPAN_ID_HEX, 16)
+
+        # Child: parent is the producer context, and the span continues the trace.
+        self.assertIsNotNone(consume.parent)
+        self.assertEqual(consume.parent.trace_id, expected_trace_id)
+        self.assertEqual(consume.parent.span_id, expected_span_id)
+        self.assertEqual(consume.context.trace_id, expected_trace_id)
+
+        # Link: queue fan-in primitive points at the same producer context.
+        self.assertTrue(consume.links)
+        self.assertEqual(consume.links[0].context.trace_id, expected_trace_id)
+
+    def test_consume_without_traceparent_starts_new_trace(self):
+        consumer = self._run_consume_with({"job_id": "42"})
+        consumer._handle_job.assert_called_once_with("42", "1-0")
+        spans = self.exporter.get_finished_spans()
+        consume = next(s for s in spans if s.name == "worker.consume")
+        # No producer context -> root span, no link.
+        self.assertIsNone(consume.parent)
+        self.assertFalse(consume.links)
+
+    def test_traceparent_merged_from_body(self):
+        import json
+
+        body = json.dumps({"job_id": "42"})
+        consumer = self._run_consume_with({"body": body, "traceparent": TRACEPARENT})
+        consumer._handle_job.assert_called_once_with("42", "1-0")
+        spans = self.exporter.get_finished_spans()
+        consume = next(s for s in spans if s.name == "worker.consume")
+        self.assertEqual(consume.parent.trace_id, int(TRACE_ID_HEX, 16))
+
+
+class TestTracerInit(unittest.TestCase):
+    def test_init_tracing_sets_tracer(self):
+        import worker.utils.tracing as tracing
+
+        # Reset module state for a clean init.
+        tracing._tracer = None
+        tracing._provider = None
+        tracing.init_tracing(endpoint="otel-collector:4317", deployment_env="local")
+        self.assertIsNotNone(tracing.get_tracer())
+        tracing.shutdown_tracing()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_logging_correlation.py b/worker/tests/test_logging_correlation.py
new file mode 100644
index 0000000..6962fcc
--- /dev/null
+++ b/worker/tests/test_logging_correlation.py
@@ -0,0 +1,38 @@
+import logging
+import unittest
+
+from opentelemetry.sdk.trace import TracerProvider
+
+from worker.utils.logger import TraceContextFilter
+
+
+class TestTraceContextFilter(unittest.TestCase):
+    """Phase 2b: log records carry the active span's trace_id/span_id."""
+
+    def _record(self):
+        return logging.LogRecord(
+            name="t", level=logging.INFO, pathname=__file__, lineno=1,
+            msg="hello", args=(), exc_info=None,
+        )
+
+    def test_no_span_emits_empty(self):
+        f = TraceContextFilter()
+        rec = self._record()
+        f.filter(rec)
+        self.assertEqual(rec.trace_id, "")
+        self.assertEqual(rec.span_id, "")
+
+    def test_active_span_stamps_ids(self):
+        provider = TracerProvider()
+        tracer = provider.get_tracer("test")
+        f = TraceContextFilter()
+        with tracer.start_as_current_span("s"):
+            rec = self._record()
+            f.filter(rec)
+            self.assertEqual(len(rec.trace_id), 32)
+            self.assertEqual(len(rec.span_id), 16)
+            self.assertRegex(rec.trace_id, r"^[0-9a-f]{32}$")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/tests/test_pipeline_spans.py b/worker/tests/test_pipeline_spans.py
new file mode 100644
index 0000000..b223c4f
--- /dev/null
+++ b/worker/tests/test_pipeline_spans.py
@@ -0,0 +1,77 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry import trace
+
+import worker.processing.processor as processor
+
+
+class TestPipelineStageSpans(unittest.TestCase):
+    """Phase 2a: dispatch emits download + dedup (+ delegate) spans."""
+
+    def setUp(self):
+        self.exporter = InMemorySpanExporter()
+        provider = TracerProvider()
+        provider.add_span_processor(SimpleSpanProcessor(self.exporter))
+        # Point the module-level proxy tracer at our in-memory provider.
+        self._tracer = provider.get_tracer("test")
+        self._orig = processor.tracer
+        processor.tracer = self._tracer
+
+    def tearDown(self):
+        processor.tracer = self._orig
+
+    def _pg_pool_returning(self, asset_row):
+        cursor = MagicMock()
+        cursor.fetchone.return_value = asset_row
+        conn = MagicMock()
+        conn.cursor.return_value = cursor
+        pg = MagicMock()
+        pg.get_pg_conn.return_value.__enter__.return_value = conn
+        return pg
+
+    @patch("worker.processing.processor.get_extension_for_mime", return_value="jpg")
+    @patch("worker.processing.processor.compute_file_hash", return_value="")
+    @patch("worker.processing.processor.process_image_file")
+    @patch("worker.processing.processor.os.path.exists", return_value=False)
+    def test_image_dispatch_emits_stage_spans(
+        self, _exists, mock_img, _hash, _ext
+    ):
+        asset_row = ("a1", "image", "uploaded", "u", "image/jpeg", None)
+        pg = self._pg_pool_returning(asset_row)
+        storage = MagicMock()
+        cfg = MagicMock()
+        cfg.temp_dir = "/tmp"
+
+        processor.process_asset_dispatch("a1", pg, storage, cfg)
+
+        names = {s.name for s in self.exporter.get_finished_spans()}
+        self.assertIn("process.dispatch", names)
+        self.assertIn("process.download", names)
+        mock_img.assert_called_once()
+
+    @patch("worker.processing.processor.get_extension_for_mime", return_value="jpg")
+    @patch("worker.processing.processor.compute_file_hash", return_value="abc123")
+    @patch("worker.processing.processor.check_for_duplicate")
+    @patch("worker.processing.processor.process_image_file")
+    @patch("worker.processing.processor.os.path.exists", return_value=False)
+    def test_dedup_span_emitted_when_hash_present(
+        self, _exists, mock_img, mock_dedup, _hash, _ext
+    ):
+        mock_dedup.return_value = processor.DedupResult.NO_DUPLICATE
+        asset_row = ("a1", "image", "uploaded", "u", "image/jpeg", None)
+        pg = self._pg_pool_returning(asset_row)
+
+        processor.process_asset_dispatch("a1", pg, MagicMock(), MagicMock(temp_dir="/tmp"))
+
+        names = {s.name for s in self.exporter.get_finished_spans()}
+        self.assertIn("process.dedup_check", names)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/worker/utils/logger.py b/worker/utils/logger.py
index bbc642c..6b2122b 100644
--- a/worker/utils/logger.py
+++ b/worker/utils/logger.py
@@ -2,8 +2,33 @@
 import os
 from typing import Optional
 
+from opentelemetry import trace
 
-_DEFAULT_FORMAT = "%(asctime)s %(levelname)s [%(name)s] %(message)s"
+
+# trace_id=<hex> matches the Grafana Loki derived-field regex (trace_id=(\w+)),
+# which links each log line to its Tempo trace. span_id is included for context.
+_DEFAULT_FORMAT = (
+    "%(asctime)s %(levelname)s [%(name)s] "
+    "trace_id=%(trace_id)s span_id=%(span_id)s %(message)s"
+)
+
+
+class TraceContextFilter(logging.Filter):
+    """Inject the active span's trace_id/span_id into every log record.
+
+    Emits empty strings when there is no active recording span, so the Grafana
+    derived field does not create a link to a non-existent trace.
+    """
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        ctx = trace.get_current_span().get_span_context()
+        if ctx is not None and ctx.is_valid:
+            record.trace_id = format(ctx.trace_id, "032x")
+            record.span_id = format(ctx.span_id, "016x")
+        else:
+            record.trace_id = ""
+            record.span_id = ""
+        return True
 
 
 def setup_logging(
@@ -35,6 +60,12 @@ def setup_logging(
         format=fmt,
     )
 
+    # Attach the trace-context filter at the handler level so it stamps every
+    # record flowing through, regardless of which logger emitted it.
+    trace_filter = TraceContextFilter()
+    for handler in logging.getLogger().handlers:
+        handler.addFilter(trace_filter)
+
     # Silence noisy libraries (optional, but recommended)
     logging.getLogger("urllib3").setLevel(logging.WARNING)
     logging.getLogger("botocore").setLevel(logging.WARNING)
diff --git a/worker/utils/metrics.py b/worker/utils/metrics.py
index 82af1be..27d1004 100644
--- a/worker/utils/metrics.py
+++ b/worker/utils/metrics.py
@@ -18,6 +18,10 @@
 from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
 from opentelemetry.sdk.metrics import MeterProvider
 from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+from opentelemetry.sdk.metrics.view import (
+    ExplicitBucketHistogramAggregation,
+    View,
+)
 from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION, DEPLOYMENT_ENVIRONMENT, SERVICE_INSTANCE_ID
 
 from worker.utils.logger import get_logger
@@ -97,8 +101,27 @@ def init_metrics(
     # Create metric reader with 15-second export interval
     reader = PeriodicExportingMetricReader(exporter, export_interval_millis=15000)
 
+    # Finer histogram buckets for duration metrics. The SDK default buckets are
+    # too coarse for sub-second work (everything lands in [0,5) so p95 reads
+    # ~4.75s), which makes the image/job latency SLIs meaningless. These cover
+    # tens-of-ms (images) through tens-of-seconds (video transcode).
+    _duration_buckets = [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300]
+    duration_views = [
+        View(
+            instrument_name=name,
+            aggregation=ExplicitBucketHistogramAggregation(_duration_buckets),
+        )
+        for name in (
+            "mpiper.asset.processing.duration",
+            "mpiper.job.processing.duration",
+            "mpiper.queue.processing.duration",
+        )
+    ]
+
     # Create meter provider
-    provider = MeterProvider(resource=resource, metric_readers=[reader])
+    provider = MeterProvider(
+        resource=resource, metric_readers=[reader], views=duration_views
+    )
     metrics.set_meter_provider(provider)
 
     # Get meter
@@ -220,6 +243,40 @@ def init_metrics(
     logger.info("OpenTelemetry metrics initialized successfully")
 
 
+def record_consume() -> None:
+    """Count one consumed queue message (no-op until init_metrics runs)."""
+    if queue_message_consumed is not None:
+        queue_message_consumed.add(1)
+
+
+def record_job(success: bool, duration_seconds: float) -> None:
+    """Record job-level outcome + duration. Safe before init (no-op)."""
+    if job_processing_total is not None:
+        job_processing_total.add(1)
+    if success and job_processing_success is not None:
+        job_processing_success.add(1)
+    if (not success) and job_processing_failed is not None:
+        job_processing_failed.add(1)
+    if job_processing_duration is not None:
+        job_processing_duration.record(duration_seconds)
+
+
+def record_asset(asset_type: str, duration_seconds: float, success: bool) -> None:
+    """Record asset processing outcome + duration, labelled by type only.
+
+    asset_type is low-cardinality (image/video); asset_id must never be a label.
+    """
+    attrs = {"asset_type": asset_type or "unknown"}
+    if asset_processing_total is not None:
+        asset_processing_total.add(1, attrs)
+    if success and asset_processing_success is not None:
+        asset_processing_success.add(1, attrs)
+    if (not success) and asset_processing_failed is not None:
+        asset_processing_failed.add(1, attrs)
+    if asset_processing_duration is not None:
+        asset_processing_duration.record(duration_seconds, attrs)
+
+
 def get_meter() -> Optional[metrics.Meter]:
     """Get the global meter instance.
     
diff --git a/worker/utils/tracing.py b/worker/utils/tracing.py
new file mode 100644
index 0000000..c2cadf5
--- /dev/null
+++ b/worker/utils/tracing.py
@@ -0,0 +1,121 @@
+"""
+worker.utils.tracing
+
+OpenTelemetry tracing initialization for the Python worker.
+
+Mirrors `worker.utils.metrics`: an OTLP gRPC exporter to the same collector
+endpoint, a BatchSpanProcessor, and the SAME W3C propagators as the Go API
+(`traceparent` + `baggage`) so the trace continues across the Redis boundary
+instead of starting fresh.
+
+The worker had OTel metric instruments but no tracer and no context extraction,
+so the distributed trace died at the queue. This closes that gap on the consumer
+side; `worker.consumer.consumer` extracts the producer context and starts the
+consume span as a child (with a link) of it.
+"""
+
+from typing import Optional
+
+from opentelemetry import trace
+from opentelemetry.baggage.propagation import W3CBaggagePropagator
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.propagate import set_global_textmap
+from opentelemetry.propagators.composite import CompositePropagator
+from opentelemetry.sdk.resources import (
+    DEPLOYMENT_ENVIRONMENT,
+    SERVICE_INSTANCE_ID,
+    SERVICE_NAME,
+    SERVICE_VERSION,
+    Resource,
+)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased, TraceIdRatioBased
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+
+from worker.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Global tracer for the worker
+_tracer: Optional[trace.Tracer] = None
+_provider: Optional[TracerProvider] = None
+
+
+def _build_sampler(deployment_env: str, sampling_rate: float):
+    """AlwaysSample in dev/local; parent-based ratio sampling otherwise.
+
+    Matches the Go API's getSampler() so both services agree on what to keep.
+    """
+    if deployment_env in ("development", "dev", "local", ""):
+        return ALWAYS_ON
+    return ParentBased(root=TraceIdRatioBased(sampling_rate))
+
+
+def init_tracing(
+    service_name: str = "mpiper-worker",
+    service_version: str = "dev",
+    endpoint: str = "otel-collector:4317",
+    deployment_env: str = "development",
+    instance_id: Optional[str] = None,
+    tls_insecure: bool = True,
+    sampling_rate: float = 1.0,
+) -> None:
+    """Initialize OpenTelemetry tracing with an OTLP gRPC span exporter.
+
+    Parameters should be sourced from the centralised config (get_config().otel).
+    Idempotent: a second call is a no-op so the worker can call it safely on
+    startup alongside init_metrics.
+    """
+    global _tracer, _provider
+
+    if _tracer is not None:
+        logger.warning("Tracing already initialized")
+        return
+
+    if "://" in endpoint:
+        endpoint = endpoint.split("://", 1)[1]
+
+    logger.info(f"Initializing OpenTelemetry tracer with endpoint: {endpoint}")
+
+    resource = Resource.create(
+        {
+            SERVICE_NAME: service_name,
+            SERVICE_VERSION: service_version,
+            DEPLOYMENT_ENVIRONMENT: deployment_env,
+            SERVICE_INSTANCE_ID: instance_id or service_name,
+        }
+    )
+
+    exporter = OTLPSpanExporter(endpoint=endpoint, insecure=tls_insecure)
+
+    provider = TracerProvider(
+        resource=resource,
+        sampler=_build_sampler(deployment_env, sampling_rate),
+    )
+    provider.add_span_processor(BatchSpanProcessor(exporter))
+    trace.set_tracer_provider(provider)
+
+    # Same propagators as the Go API (composite TraceContext + Baggage) so the
+    # traceparent the producer injected is understood here.
+    set_global_textmap(
+        CompositePropagator(
+            [TraceContextTextMapPropagator(), W3CBaggagePropagator()]
+        )
+    )
+
+    _provider = provider
+    _tracer = trace.get_tracer(__name__)
+    logger.info("OpenTelemetry tracer initialized successfully")
+
+
+def get_tracer() -> Optional[trace.Tracer]:
+    """Return the global worker tracer, or None if init_tracing was not called."""
+    return _tracer
+
+
+def shutdown_tracing() -> None:
+    """Flush and shut down the tracer provider on exit."""
+    if _provider is not None:
+        _provider.shutdown()
+        logger.info("Tracer provider shutdown complete")