diff --git a/cmd/server/main.go b/cmd/server/main.go index bb4f886..9a2a2a2 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "database/sql" "errors" "fmt" "net/http" @@ -125,6 +126,13 @@ func main() { _ = m.RegisterOutboxPendingFunc(func(ctx context.Context) (int64, error) { return outboxRepo.CountPending(ctx) }) + + // Observe the database connection-pool stats (in-use / idle / open / max / + // wait count). sqlx.DB embeds *sql.DB, so db.Stats() exposes pool saturation + // — the key signal for whether the DB pool is a bottleneck under load. + _ = m.RegisterDBStatsFunc(func() sql.DBStats { + return db.Stats() + }) go relay.Start(serverCtx) go relay.StartCleanup(serverCtx, cfg.Outbox.Retention) diff --git a/docker-compose.loadtest.yml b/docker-compose.loadtest.yml new file mode 100644 index 0000000..55fadc9 --- /dev/null +++ b/docker-compose.loadtest.yml @@ -0,0 +1,54 @@ +# ============================================================================ +# MPiper — Load-test overlay (Track 3, Phase 0) +# +# docker compose \ +# -f docker-compose.yml \ +# -f docker-compose.observability.yml \ +# -f docker-compose.loadtest.yml \ +# up -d --build +# +# Why this exists +# --------------- +# Local load-test results are only interpretable when resources are PINNED, so +# "the worker is the bottleneck" is a stable, observable fact rather than +# something that moves run to run depending on spare laptop cores. This overlay: +# +# 1. Caps CPU/memory on `api` and `worker` via `deploy.resources.limits` +# (honoured by `docker compose up` in Compose v2 — no swarm needed). +# 2. Forces TRACE_SAMPLING_RATE=1.0 so every asset produces a full trace. +# +# The worker is intentionally pinned to a single CPU: it is single-threaded +# today (Track 1), and pinning makes that saturation visible and reproducible. +# Document the limits alongside every experiment — changing them changes ffmpeg +# thread behaviour, which is the point, but it must be recorded for comparison. +# ============================================================================ + +name: mpiper + +services: + api: + environment: + # Full sampling locally — never lose a trace to the 0.1 code default. + TRACE_SAMPLING_RATE: "1.0" + deploy: + resources: + limits: + cpus: "1.0" + memory: 512M + reservations: + cpus: "0.25" + memory: 128M + + worker: + environment: + # Full sampling locally; matches the API so the whole trace is captured. + TRACE_SAMPLING_RATE: "1.0" + deploy: + resources: + limits: + # Single CPU on purpose — surfaces the single-threaded bottleneck. + cpus: "1.0" + memory: 1024M + reservations: + cpus: "0.5" + memory: 256M diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml index 65d4760..7ff8300 100644 --- a/docker-compose.observability.yml +++ b/docker-compose.observability.yml @@ -21,7 +21,7 @@ services: # Grafana Tempo - Distributed Tracing Backend # ========================================================================== tempo: - image: grafana/tempo:latest + image: grafana/tempo:2.6.1 container_name: mpiper-tempo command: ["-config.file=/etc/tempo.yaml"] volumes: @@ -102,8 +102,14 @@ services: - '--web.console.libraries=/usr/share/prometheus/console_libraries' - '--web.console.templates=/usr/share/prometheus/consoles' - '--web.enable-lifecycle' + # Accept k6's Prometheus remote-write output (Track 3, Phase 4) so client- + # side load metrics land in the same Prometheus as the server-side ones. + - '--web.enable-remote-write-receiver' + # Exemplar storage powers the histogram-bucket -> Tempo trace links. + - '--enable-feature=exemplar-storage' volumes: - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml + - ./observability/prometheus.rules.yml:/etc/prometheus/prometheus.rules.yml - prometheus-data:/prometheus ports: - "9090:9090" diff --git a/docs/enhancements/README.md b/docs/enhancements/README.md new file mode 100644 index 0000000..9e79164 --- /dev/null +++ b/docs/enhancements/README.md @@ -0,0 +1,204 @@ +# MPiper Enhancements — Roadmap + +This directory tracks the work that takes MPiper from a well-built side project +to a production-grade media platform. Each **track** is chosen to teach a +distinct, transferable systems-engineering concept *and* to add real product +value — not feature-padding. + +The philosophy: **write a design doc per track before coding** (problem, options, +decision, tradeoffs, how success is measured), and pair every track with a +load test or chaos experiment so each claim ("now it scales", "now it's +exactly-once") is *demonstrated*, not assumed. + +> **Progress:** Track 3 (observability + load testing) is **done** ✅. It shipped +> the foundation that makes everything below *measurable* — end-to-end tracing, +> SLOs, Grafana dashboards, and a k6 load harness. The first load test +> ([`experiments/0001-worker-saturation.md`](../../experiments/0001-worker-saturation.md)) +> already re-ordered this roadmap with hard data instead of hunches. + +## Where we are today + +A clean, correct, **single-tenant, best-effort, single-node-throughput** pipeline +with good bones — now fully observable: + +- Transactional enqueue via an **outbox relay** (Postgres → Redis Streams). +- An **idempotent-ish consumer** with content-hash dedup. +- **Presigned uploads** with a split internal/public storage endpoint. +- **Webhooks** with HMAC signing + exponential backoff. +- **End-to-end distributed tracing** (API → outbox → Redis → worker → ffmpeg, one + waterfall per asset), **OTel metrics** on API and worker, **SLO recording rules**, + provisioned **Grafana dashboards**, and a host-run **k6 load harness** — on the + bundled Grafana/Tempo/Loki/Prometheus stack. + +Known seams where "side project" becomes "system" (verified in code, and now +several of them **measured** under load): + +- **Single-threaded worker** — `MAX_CONCURRENT_JOBS` exists in config but is never + used; `consume()` pulls one message (`count=1`) and processes it inline. + *Measured:* μ ≈ **1.1 jobs/s**, worker CPU **98%**, queue depth → **2,544**. The + hard throughput ceiling and the confirmed **#1 bottleneck**. +- **Webhook dispatcher can't keep up** — a 2s poll × batch-50 loop delivering + webhooks with synchronous HTTP + retries. *Measured:* `webhook_pending` peaked at + **~5,900** and never drained. A second, independent bottleneck. +- **Homegrown recovery** — a 2-min DB-scan + re-`XADD`, not Redis Streams' + `XPENDING`/`XAUTOCLAIM` consumer-group recovery; poison messages are marked + `failed` and dropped (no dead-letter stream). +- **No raw-upload lifecycle** — objects in `media/raw/` are never deleted. + *Measured:* ~**50%** of presigned uploads are never completed → orphaned objects + accumulate. +- **Homegrown auth** — an AES-GCM token with no expiry/rotation, and the same + `ENCRYPTION_KEY` signs both auth tokens and webhook secrets. +- **Polled high-churn tables** (`jobs`, `event_outbox`, `webhook_deliveries`), grown + unbounded with cleanup-by-retention only. *Measured:* `event_outbox` kept up with + **0 backlog** and the DB had headroom (**18 ms** mean query, **5/25** connections); + only `webhook_deliveries` actually strained. + +## What the first load test proved (exp 0001) + +Track 3 gave us the instrumentation to stop guessing. The first saturating run +(`open --rate 10/s`, CPU-pinned worker) turned the seams above into a **measured, +ranked** list — and every track below now has a baseline to beat by re-running the +*same* k6 profile and comparing the dashboards. + +| Finding (measured) | What it means | Owner | +|---|---|---| +| Worker μ ≈ **1.1 jobs/s**, CPU 98%, queue → **2,544** | Single-threaded worker is the throughput ceiling | **Track 1 (P0)** | +| `webhook_pending` → **5,901**, never drains | Dispatcher delivery rate ≪ insertion rate | **Track 1b (P1, new)** | +| `event_outbox` **0 backlog**; DB **18 ms** mean, **5/25** conns | Outbox + DB have large headroom *today* | Track 7 → **defer** | +| `webhook_deliveries` is the one polled table straining | The real, current trigger for data-layer work | Track 7 → **rescope to this** | +| **~50%** of presigns never completed → orphaned `media/raw/` | Storage grows with abandoned uploads | Track 5 (small) | +| `/complete` p99 **358 ms** (synchronous MinIO HEAD) | Minor hot-path tail | Track 5 (small) | + +Net effect: **Track 1 is confirmed P0**, a **webhook-throughput bottleneck was +surfaced that no track owned** (now Track 1b), and **Track 7's table-partitioning is +premature** — the DB isn't the problem yet; the webhook *delivery loop* is. + +## Tracks + +| # | Track | Core systems lesson | Status | +|---|-------|---------------------|--------| +| 1 | [Concurrent worker + proper stream recovery + DLQ](track-01-concurrent-worker.md) | Concurrency models, at-least-once recovery, poison-message handling, head-of-line blocking | **next — P0 (data-confirmed)** | +| 1b | Webhook delivery throughput *(surfaced by exp 0001)* | Concurrent I/O-bound delivery, backpressure on a side-channel, decoupling fan-out from job completion | **new — P1** | +| 2 | [Queue-depth autoscaling](track-02-autoscaling.md) | Backpressure, control loops, Little's Law, SLO-driven capacity | planned (after T1) | +| 3 | [End-to-end tracing, SLOs & local load testing](track-03-observability-and-load.md) | Context propagation across async boundaries, the three pillars, SLO/SLI/error budgets, load-test methodology | **done ✅** | +| 4 | [Multi-tenancy, auth & quotas](track-04-multitenancy-auth.md) | AuthN vs AuthZ, key rotation, the idempotency pattern, tenant isolation | planned | +| 5 | [Production ingestion pipeline](track-05-ingestion.md) | Resumable/multipart uploads, pipeline stages, defense-in-depth, trust boundaries | planned | +| 6 | [Adaptive streaming + CDN](track-06-adaptive-streaming.md) | ABR streaming, CDN cache/invalidation, edge auth, encoding cost/quality tradeoffs | planned | +| 7 | [Data layer at scale](track-07-data-layer.md) | Table partitioning, CDC vs polling, index design under write load | **deferred — rescope to `webhook_deliveries`** | +| 8 | [Resilience & correctness verification](track-08-resilience.md) | Failure-mode analysis, exactly-once in practice, replay attacks, chaos engineering | planned | + +> Track 3 is the only track with a full design doc checked in, because it was built +> first. Now that it's done, every track below is **measurable**: implement, re-run +> the same k6 profile, compare dashboards, and record an `experiments/NNNN-*.md` +> writeup. "It scales" is a claim we can prove, not assert. + +## Recommended sequence (re-prioritized from exp 0001 data) + +1. **Track 1 — concurrent worker + DLQ + stream recovery.** **P0, data-confirmed.** + The single-threaded worker is the throughput ceiling (μ ≈ 1.1 jobs/s, queue → 2,544). + Biggest lever; self-contained. *Verify:* re-run `open --rate 10/s` — expect μ to + scale with the pool and the queue to stabilize → `experiments/0002`. +2. **Track 1b — webhook delivery throughput.** **P1, newly surfaced.** Independent of + the worker: `webhook_pending` hit ~5,900 and never drained. Concurrent/batched + delivery + wire the unrecorded `webhook_delivery_*` metrics. Small, high-value. + *Verify:* `webhook_pending` drains under the same load. +3. **Track 2 — autoscaling.** Needs a concurrent worker first; then scale it on the + queue-lag signal we already expose. Now directly measurable. +4. **Track 4 — multi-tenancy + idempotency + auth.** The leap to "real users". +5. **Track 6 — adaptive streaming + CDN.** The headline product feature. +6. **Track 5 — ingestion.** Includes the small wins exp 0001 surfaced: abandoned-upload + lifecycle (~50% orphaned `media/raw/`) and the `/complete` MinIO-HEAD tail. +7. **Track 7 — data layer.** **Deferred and rescoped.** DB/outbox have headroom today; + revisit when volume justifies, scoped first to `webhook_deliveries` churn (the one + polled table that actually strained) rather than blanket partitioning. +8. **Track 8 — resilience & correctness.** Depth once the throughput tracks land. + +> **Track 3 follow-ups (do before the next experiment so p95s aren't distorted):** +> wire the `webhook_delivery_*` and `storage_operation_*` metrics, add a fine-bucket +> view to `db.query.duration`, and standardize histogram buckets across services. + +--- + +## Track catalog (summaries) + +### Track 1 — Concurrent worker + proper stream recovery + DLQ +**Gap:** one job at a time; a 3s video blocks a 200ms thumbnail. Recovery scans +Postgres and re-`XADD`s instead of using consumer-group delivery state. +**Move:** bounded worker pool (process pool for CPU-bound ffmpeg/Pillow vs async +for I/O — *choosing which is the lesson*); honour `MAX_CONCURRENT_JOBS` as a +semaphore; `XAUTOCLAIM`/`XPENDING` to reclaim dead-consumer messages; a +**dead-letter stream** for messages past the attempt cap; priority lanes so small +jobs don't queue behind large transcodes. +**Teaches:** thread vs process vs async, the GIL, CPU vs I/O bound, at-least-once +recovery, poison-message handling, head-of-line blocking. + +### Track 1b — Webhook delivery throughput *(surfaced by exp 0001)* +**Gap:** the dispatcher polls every 2s, batch 50, and delivers webhooks with +*synchronous* HTTP + retries on a single loop. Each job emits 3 events +(`job.starting/started/done`), so insertion rate ≫ delivery rate — the load test +drove `webhook_pending` to ~5,900 with no recovery. Delivery is also under- +instrumented: `webhook_delivery_total/duration/failures` are defined but never +recorded, so only the `pending` gauge revealed the backlog. +**Move:** a bounded pool of concurrent delivery workers (I/O-bound → async/threads +fits); decouple fan-out from job completion; wire the delivery metrics + a +delivery-latency SLI. Optionally move webhook rows onto their own stream consumer +rather than a DB poll. +**Teaches:** concurrency for I/O-bound work, backpressure on a side-channel, +decoupling producers from slow consumers, instrumenting before optimizing. + +### Track 2 — Queue-depth autoscaling +**Gap:** static worker count; bursts grow latency unbounded, idle wastes capacity. +**Move:** expose stream lag + oldest-message-age (extend the existing relay-lag +metric); drive **KEDA** (k8s manifests already exist) to scale workers on lag; +load-test the backlog → scale → drain cycle. +**Teaches:** backpressure, control loops, latency- vs queue-depth-based scaling, +Little's Law (L = λW), capacity planning. + +### Track 4 — Multi-tenancy, auth & quotas +**Gap:** homegrown AES token (no expiry/rotation, shared key with webhook secrets); +single bucket, path-prefixed; no idempotency keys (retried presign = duplicate asset). +**Move:** OIDC/JWT (asymmetric keys, expiry, JWKS rotation) or scoped API keys; +separate webhook-signing secret; org→project→asset model with repository-layer +row scoping and per-tenant storage prefixes/credentials; **idempotency keys** on +`presign`/`complete`; per-tenant **quotas + rate limits** with usage accounting. +**Teaches:** authN vs authZ, key management/rotation, the idempotency pattern, +tenant isolation, security blast-radius. + +### Track 5 — Production ingestion pipeline +**Gap:** single presigned `PUT`, 500MB cap, no resumability, MIME-only validation, +no scanning. Plus (from exp 0001) **no lifecycle for abandoned uploads** — ~50% of +presigns never complete, orphaning `media/raw/` objects. +**Move:** S3 **multipart/resumable** uploads with part-level retry; a validation +stage with real content sniffing (`python-magic` is already a dep); optional +**ClamAV** malware scan as a stage; dedup *before* full download via verified +client-supplied hash; a TTL/lifecycle sweep for un-completed raw uploads. +**Teaches:** large-file transfer, pipeline/stage design, defense-in-depth, trust +boundaries (never trust client content-type). + +### Track 6 — Adaptive streaming + CDN +**Gap:** one fixed 720p MP4 at hardcoded 2500kbps, served straight from MinIO. +**Move:** generate an **HLS/DASH adaptive ladder** (multiple renditions + manifest — +`variants.video.manifest_url` already exists in the schema); serve via **CDN** with +signed URLs + cache-control; content-aware/per-title encoding decisions. +**Teaches:** adaptive bitrate streaming, CDN cache strategy + invalidation, edge +signed-URL access control, encoding cost/quality tradeoffs. + +### Track 7 — Data layer at scale *(deferred — see exp 0001)* +**Gap:** `jobs`, `event_outbox`, `webhook_deliveries` polled and growing. The load +test showed the DB and outbox have **headroom today** (18 ms mean query, 5/25 +connections, 0 outbox backlog), so blanket partitioning is premature — but +`webhook_deliveries` is the one table that genuinely strained. +**Move:** start narrow — partition/clean `webhook_deliveries` and replace its poll +with `LISTEN/NOTIFY` or a stream consumer (overlaps Track 1b). Broaden to the other +tables (monthly partitions; drop instead of DELETE; read replicas; CDC) only when +volume justifies it. +**Teaches:** partitioning, CDC vs polling, write-heavy index design, pool sizing. + +### Track 8 — Resilience & correctness verification +**Gap:** unit + integration tests exist, but no proof of survival under failure/load. +**Move:** **fault injection / chaos** (kill the worker mid-transcode, pause Redis, +fill the disk — verify processed-once holds); **load tests** with latency budgets in +CI; **webhook contract tests** + replay protection (sign a timestamp, reject stale +deliveries — today a captured payload replays forever). +**Teaches:** failure-mode analysis, exactly-once vs at-least-once in practice, +replay attacks, reliability as a tested property. diff --git a/docs/enhancements/track-03-handoff.md b/docs/enhancements/track-03-handoff.md new file mode 100644 index 0000000..089c030 --- /dev/null +++ b/docs/enhancements/track-03-handoff.md @@ -0,0 +1,222 @@ +# Track 3 — Session Handoff (start here) + +**Purpose:** everything a fresh conversation needs to begin **Track 3 +(end-to-end tracing, SLOs & local load testing)** without prior context. Read +this top to bottom, then open `track-03-observability-and-load.md` for the full +design and phased plan. This doc is the *operational* companion: where things +are, how to run them, and the landmines already discovered. + +--- + +## 1. What MPiper is (60-second orientation) + +A media-processing pipeline: a **Go API** (`cmd/server`, `internal/`) accepts +uploads and a **Python worker** (`worker/`) processes them. They communicate over +**Redis Streams** (`media:jobs`). **Postgres** is the durable source of truth; +**MinIO** (S3-compatible) stores objects. Webhooks notify clients of job +lifecycle events. + +**Asset flow:** +`POST /api/v1/storage/presign` → client `PUT`s file to MinIO → +`GET /api/v1/assets/{id}/complete` (writes asset `uploaded` + job + outbox row + +`job.starting` webhook rows in one tx) → **outbox relay** (1s poll) publishes to +Redis → **worker** consumes → image (3 webp variants) or video (poster + 720p + +preview) → variants written to MinIO + Postgres, asset `ready` → worker inserts +`job.started`/`job.done` webhook rows → **dispatcher** (2s poll) delivers signed POSTs. + +--- + +## 2. The Track 3 goal in one sentence + +Make one **trace per asset** that spans API → Redis → worker → ffmpeg (so queue +wait and per-stage time are visible), define a small set of **SLOs**, and build a +**local k6 load harness** + Grafana dashboards so we can saturate the system on a +laptop and *see* where it bends. Full plan: `track-03-observability-and-load.md`. + +--- + +## 3. Current telemetry state (verified in code) + +- **Go API:** OTel **traces + metrics**, exported OTLP to `otel-collector:4317`. + Tracer init in `internal/metrics/otel.go`; metric instruments in + `internal/metrics/metrics.go`. +- **Python worker:** OTel **metrics only** (`worker/utils/metrics.py`, OTLP to + `otel-collector:4317`). **No tracer, no span creation, no context extraction.** +- **The gap:** the Go side traces the HTTP request and `Enqueue`, but **never + injects a `traceparent`** into the Redis message or the outbox row. The worker + therefore starts fresh with no parent. The trace dies at the queue boundary. +- **Observability stack** (`docker-compose.observability.yml`, configs in + `observability/`): OTel Collector (bridges `mpiper_net` ↔ `mpiper_obs_net`), + **Tempo** (traces), **Prometheus** (metrics), **Loki + Promtail** (logs), + **Grafana** (dashboards, anonymous admin). Collector pipeline: OTLP receiver → + Tempo (traces) + Prometheus exporter `:8889` (metrics). + +> Note: `CLAUDE.md` historically said the worker is "prometheus_client (not OTel)" +> — that's **stale/wrong**; the worker uses OTel metrics. Don't trust that line. + +--- + +## 4. Exact engineering targets for Phase 1 (close the trace gap) + +These are the precise seams to touch. Verify each before editing. + +**Inject context (Go):** +- `internal/queue/queue.go` — `RedisQueue.Enqueue` builds the stream message + (a `map`); the worker reads its fields. Inject `traceparent` (and `tracestate`) + here using the OTel propagator, as top-level message field(s). +- `internal/outbox/relay.go` — `tick()` unmarshals the outbox row payload and calls + `queue.Enqueue(ctx, payload)`. Because enqueue is **store-and-forward**, the + trace context must survive in the **outbox row** too: capture it when the row is + written in `internal/service/asset.go` (`MarkAssetUploaded`, the + `outboxRepo.InsertTx` call), persist it (extend `internal/models/outbox.go` + + `internal/repository/outbox_repo.go` + a migration), and re-inject on relay. +- **Verify the global propagator is set** in `internal/metrics/otel.go` + (`otel.SetTextMapPropagator(propagation.TraceContext{})`). If missing, add it — + without it, injection is a no-op. + +**Extract + continue (Python):** +- Add `worker/utils/tracing.py` mirroring `worker/utils/metrics.py` (tracer init, + OTLP exporter to the same endpoint). Find where `init_metrics(...)` is called and + init the tracer alongside it (same lifecycle). +- `worker/consumer/consumer.py` — in `consume()`, after the message payload is + normalized (note: a `body` field, if present, is JSON-decoded and merged), read + `traceparent` and start the consumer span. Use a **child span with a link** to + the producer context (link is the correct primitive for queue fan-in; child span + keeps the Tempo waterfall readable). + +**Span the stages (Phase 2):** +- `worker/processing/processor.py` — `process_asset_dispatch` (download, dedup check). +- `worker/processing/images.py` — per-variant encode/upload. +- `worker/processing/videos.py` — `run()` wraps each ffmpeg call (poster / transcode_720p / preview). +- Stamp `trace_id`/`span_id` into worker + API structured logs for Loki↔Tempo linking. + +**Message format reminder:** the consumer accepts either `job_id` (canonical) or +`asset_id`. The outbox payload (built in `asset.go`) currently carries `job_id`, +`asset_id`, `event`, `timestamp`. Add trace context as additional field(s); don't +break the existing keys. + +--- + +## 5. Environment & topology facts (host = macOS) + +**Host ports → containers:** +| Service | Host | Container | Notes | +|---|---|---|---| +| API | 5010 | 5010 | `/healthz`, `/api/v1/...` | +| Postgres | 5433 | 5432 | user `mpiper`, db `mpiper`, pw `changeme` | +| Redis | 6380 | 6379 | stream `media:jobs`, group `worker-group` | +| MinIO API | 9000 | 9000 | bucket `mpiper` (anon download on) | +| MinIO console | 9001 | 9001 | minioadmin / minioadmin | +| Grafana | 3000 | 3000 | anon admin | +| Prometheus | 9090 | 9090 | | +| Tempo | 3200 | 3200 | OTLP in on 4317/4318 (obs net) | +| webhook-receiver | 8888 | 8080 | overlay only | + +**Container names:** `mpiper-api`, `mpiper-worker`, `mpiper-postgres`, +`mpiper-redis`, `mpiper-minio`, `mpiper-webhook-receiver`, `mpiper-otel-collector`, +`mpiper-tempo`, `mpiper-prometheus`, `mpiper-grafana`, `mpiper-loki`. + +**Storage split endpoint (implemented):** `S3_ENDPOINT_URL=http://minio:9000` +(internal I/O) vs `S3_PUBLIC_ENDPOINT_URL=http://localhost:9000` (presigned + +public URLs). Don't undo this — host-run load tests depend on it. + +**Telemetry env (`.env.local`):** `OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317`, +`OTEL_TLS_INSECURE=true`, `TRACE_SAMPLING_RATE` (default 0.1 in code — **set to +1.0 locally** so every asset traces). `ENCRYPTION_KEY=0123456789abcdef0123456789abcdef` +(32 bytes; used for auth tokens AND webhook secrets). + +--- + +## 6. Runbook / command cheat sheet + +```bash +# Bring up core + observability (+ webhooks if you want webhook traces too) +docker compose -f docker-compose.yml -f docker-compose.observability.yml up -d --build +# add: -f docker-compose.webhooks.yml (for webhook receiver) + +# End-to-end smoke (host-run; image + video + webhooks; 23 checks) +./scripts/demo-e2e.sh + +# Go: build / vet / tests (tests/performance_suite_test.go FAILS unless PERF_TEST_URL set — ignore) +go build ./... && go vet ./... && go test ./... + +# Worker tests: the local .venv (py3.14) LACKS psycopg_pool/pytest/cryptography. +# Run them INSIDE the worker container instead: +docker exec -w /app mpiper-worker python -m unittest discover -s worker/tests -p 'test_*.py' -v + +# Mint an auth token from the host (system python3 has `cryptography`; venv does not): +TOKEN=$(python3 - <<'PY' +import base64, os +from cryptography.hazmat.primitives.ciphers.aead import AESGCM +key=b"0123456789abcdef0123456789abcdef"; nonce=os.urandom(12) +print(base64.urlsafe_b64encode(nonce+AESGCM(key).encrypt(nonce,b"demo-user",None)).rstrip(b"=").decode()) +PY +) + +# Inspect DB +docker exec mpiper-postgres psql -U mpiper -d mpiper -c "SELECT asset_id,status,type FROM assets ORDER BY created_at DESC LIMIT 5;" + +# Reset all state (assets/variants/objects accumulate across runs) +docker compose -f docker-compose.yml -f docker-compose.observability.yml down -v + +# UIs: Grafana http://localhost:3000 · Prometheus http://localhost:9090 · Tempo via Grafana Explore +``` + +--- + +## 7. Landmines (things that already bit, or will) + +- **Worker is single-threaded.** `MAX_CONCURRENT_JOBS` is in `worker/consumer/ + config.py` but **never used**; `consume()` does `count=1` and processes inline. + This is the expected bottleneck Phase 5 should prove — don't "fix" it in Track 3. +- **Recovery is homegrown.** A 2-min DB scan re-`XADD`s stale jobs; no + `XPENDING`/`XAUTOCLAIM`; poison messages are marked `failed` and dropped (no DLQ). + That's Track 1, not Track 3. +- **Global propagator may be unset** in Go — injection silently no-ops without it. Check first. +- **Sampling.** Code default `TRACE_SAMPLING_RATE=0.1`. Set 1.0 locally or you'll + lose most traces and think propagation is broken. +- **Dedup hides work.** Identical fixtures dedup after the first asset → near-zero + work on repeats. The load harness must **fan out unique bytes** to measure real + per-job cost. +- **Cardinality.** asset_id is fine as a *trace/span attribute*; **never** put it on + a *metric* label. +- **Health check.** `cmd/server --health-check` is now a real `/healthz` probe + (was previously booting a second server and failing to bind 5010 → api unhealthy + → worker wouldn't start). If you change startup, keep that path lightweight. +- **Rebuild after code changes.** api/worker run from built images: + `docker compose ... build api worker && docker compose ... up -d`. +- **Local ≠ prod.** Trust bottleneck *location* and before/after deltas, not + absolute throughput numbers. + +--- + +## 8. Suggested first-session scope + +Do **Phase 0 + Phase 1** together (highest value, gets a real cross-boundary trace fast): + +1. **Phase 0:** add `deploy.resources.limits` (cpu/mem) to `api` + `worker` in a + compose overlay; set `TRACE_SAMPLING_RATE=1.0`; bring up with the observability + overlay; capture a baseline `demo-e2e.sh` run and confirm spans land in Tempo. +2. **Phase 1:** Go `traceparent` injection (enqueue + outbox row + migration) → + worker tracer + extraction in `consume()`. + +**Acceptance:** open one asset in Grafana/Tempo and see a single trace from +`POST /storage/presign` through `enqueue` → (visible queue-wait gap) → worker +`consume` span. That alone is a satisfying, demoable win. + +Then continue with Phases 2–5 from the design doc (pipeline spans + log +correlation → SLO recording rules + dashboards → k6 harness → first experiment +writeup that names the worker bottleneck, feeding Track 1). + +--- + +## 9. Repo / git state at handoff + +- Branch: `feat/webhook-notifications`; open **PR #18**. +- Demo-readiness + split-endpoint work is committed (`9404c7a`) and pushed. +- `docs/enhancements/` (this file + `README.md` + `track-03-observability-and-load.md`) + may be **uncommitted** — commit them at the start of the Track 3 session. +- Key docs to read: `docs/enhancements/README.md` (catalog), + `track-03-observability-and-load.md` (plan), `docs/arch/*` (existing outbox/ + reliability design notes), `CLAUDE.md` (repo conventions; note the stale worker- + telemetry line). diff --git a/docs/enhancements/track-03-observability-and-load.md b/docs/enhancements/track-03-observability-and-load.md new file mode 100644 index 0000000..43ff9c9 --- /dev/null +++ b/docs/enhancements/track-03-observability-and-load.md @@ -0,0 +1,214 @@ +# Track 3 — End-to-end tracing, SLOs & local load testing + +**Status:** planning · **Prereq:** none · **Unlocks:** makes every other track measurable + +## 1. Problem + +We can't improve what we can't see. Right now: + +- The **distributed trace breaks at the Redis boundary.** The Go API traces the + HTTP request and the `Enqueue` call, but it never injects a `traceparent` into + the stream message. The worker has OTel **metrics** but **no tracer** and does + no context extraction. So we cannot answer "for *this* asset, where did the 40 + seconds go?" as a single trace spanning API → outbox → Redis → worker → ffmpeg → + variant write. +- We have metrics but **no SLOs** — no agreed definition of "good", so no way to + say whether a change helped. +- We have **no way to generate controlled load**, so we've never seen the system + bend. The single-threaded worker (Track 1) is an invisible bottleneck until + something pushes on it. + +The user's real question: *this is a local project — how do we test under load +and actually understand what's working, failing, and needs optimization?* + +That question is answered in §3. + +## 2. Goals / Non-goals + +**Goals** +- One trace per asset, end to end, across the queue boundary, viewable in Tempo. +- A small, explicit set of **SLIs and SLOs** for the pipeline. +- A repeatable **local load harness** that can saturate the system on a laptop. +- Grafana dashboards (RED for the API, USE for the worker/host, a pipeline-latency + funnel, queue lag) wired so a metric spike links to an example trace (exemplars). +- A written **bottleneck-analysis loop**: load → observe → locate → optimize → re-run → compare. + +**Non-goals** +- Production-scale absolute numbers. Local results are **relative** — they reveal + bottlenecks and validate *direction*, not real-world capacity (see §7). +- Alerting/paging infrastructure (note SLO burn-rate alerts as a follow-up). +- Replacing the existing stack — we extend the bundled Tempo/Prometheus/Loki/Grafana. + +## 3. Can you load-test meaningfully on a laptop? Yes — here's the methodology + +The misconception is that load testing requires cloud scale. It doesn't. Load +testing is about **saturating the system relative to its own capacity** and +watching where it bends. A single-threaded worker on a laptop saturates at a +handful of concurrent jobs — you can absolutely push it past that locally. + +The thing that makes local results *interpretable* is **pinning resources** so +runs are reproducible and the bottleneck isn't hidden by spare laptop cores. We +add CPU/memory limits to the `api` and `worker` containers (compose `deploy. +resources.limits`) so "the worker is the bottleneck" is a stable, observable fact +rather than something that moves run to run. + +**The loop we're building:** + +``` + ┌─────────────────────────────────────────────┐ + │ 1. Define SLIs/SLOs (what "good" means) │ + └───────────────┬─────────────────────────────┘ + ▼ + ┌─────────────────────────────────────────────┐ + │ 2. Instrument end-to-end (close the trace gap)│ + └───────────────┬─────────────────────────────┘ + ▼ + ┌────────────┐ generate ┌─────────────────────────┐ + │ k6 (host) │ ────────────▶ │ MPiper (CPU-pinned) │ + │ load model │ presign→PUT │ API + worker │ + └────────────┘ →complete └───────────┬─────────────┘ + │ client-side metrics │ app OTel traces+metrics + ▼ ▼ + ┌─────────────────────────────────────────────────────┐ + │ 3. Observe in Grafana: RED, USE, pipeline funnel, │ + │ queue lag — metric spike → exemplar trace in Tempo │ + └───────────────┬─────────────────────────────────────┘ + ▼ + ┌─────────────────────────────────────────────────────┐ + │ 4. Locate bottleneck (trace waterfall + USE) → │ + │ optimize → re-run same profile → compare │ + └─────────────────────────────────────────────────────┘ +``` + +### Load model (this is the subtle part) + +- **Closed model (fixed VUs):** N virtual users each loop presign→upload→complete + as fast as they can. Good for finding max throughput and saturation point. +- **Open model (fixed arrival rate):** X new uploads/sec regardless of how fast the + system responds. Good for finding the **latency knee** and watching queue lag + grow when arrival rate > service rate (a live demonstration of Little's Law: + `L = λW`). + +We use **k6** run from the **host** (like `scripts/demo-e2e.sh`): the host can +reach both the API (`localhost:5010`) and MinIO (`localhost:9000`), so k6 performs +the *real* client flow — presign, `PUT` the file to the public endpoint, then +`complete`. k6 uploads real fixtures (the existing image + `tests/test_assets/ +sample.mp4`), optionally fanning out copies with unique bytes to defeat content-hash +dedup when we want true per-job work. + +Two views of the same run: +- **Client view** (k6's own metrics): request rate, error rate, client-side + latency percentiles → remote-written to the bundled Prometheus. +- **Server view** (MPiper's OTel): the pipeline's internal spans and metrics — + this is the point of the track, and what we'll mostly read. + +## 4. Design + +### 4.1 Close the trace gap (the core engineering work) + +1. **Inject context on enqueue (Go).** When the outbox relay (or `RedisQueue. + Enqueue`) publishes, inject the active span context as a `traceparent` field in + the stream message using the OTel propagator. The outbox row should carry the + trace context too (so the trace survives the store-and-forward hop). +2. **Extract + continue on consume (Python).** Add an OTel **tracer** to the worker + (mirroring `worker/utils/metrics.py`). In `consume()`, extract `traceparent` + from the message and start the consumer span as a **child** (a span link is the + correct primitive for queue fan-in; we'll use a child span with a link to keep + the waterfall readable). +3. **Span the pipeline stages.** Wrap `process_asset_dispatch`, download, + dedup-check, each image variant, and each ffmpeg invocation (poster / transcode / + preview) in spans with attributes (asset_id, type, bytes, role, ffmpeg rc). +4. **Correlate logs.** Stamp `trace_id`/`span_id` into worker + API structured logs + so Loki ↔ Tempo cross-linking works in Grafana. + +End result: open an asset in Tempo and see `HTTP POST /presign … → enqueue → +(time in queue) → worker consume → download → transcode_720p → write variant`, +with the **queue wait time** visible as the gap between enqueue and consume. + +### 4.2 SLIs / SLOs (initial, deliberately small) + +| SLI | Definition | Initial SLO (local) | +|-----|------------|---------------------| +| Presign latency | p95 of `POST /storage/presign` | < 150 ms | +| Image ready latency | p95 (complete → asset `ready`) for images | < 5 s | +| Video ready latency | p95 (complete → asset `ready`) for videos | < 60 s | +| Queue wait | p95 (enqueue → consume start) | < 2 s | +| Job success rate | done / (done + failed) | > 99% | +| Webhook delivery latency | p95 (event row created → delivered) | < 10 s | + +These come straight from spans/metrics we'll have. The numbers are starting +guesses; the *point* is to make them explicit, then move them based on data. + +### 4.3 Dashboards (Grafana, provisioned in `observability/grafana/dashboards`) + +- **API — RED:** request **R**ate, **E**rror rate, **D**uration (p50/p95/p99) per route. +- **Worker/host — USE:** CPU/mem **U**tilization, **S**aturation (queue depth, + in-flight jobs), **E**rrors. (cAdvisor/node metrics or the collector's own.) +- **Pipeline funnel:** uploaded → processing → ready/failed counts + the + per-stage latency breakdown (from spans). +- **Queue health:** stream length, oldest-pending age, outbox relay lag (metric + already exists), webhook pending gauge (already exists). +- **Exemplars:** histogram panels link a bucket spike to a concrete Tempo trace. + +### 4.4 Bottleneck-analysis loop (documented runbook) + +For each experiment: fix a load profile, run it, then read in order — (1) is the +SLO breached? (2) USE: is the worker CPU-saturated or queue-saturated? (3) open an +exemplar trace: which span dominates? (4) form a hypothesis, change one thing, +re-run the **same** profile, compare. Record results in an `experiments/` log so +"the transcode span dropped from 38s→6s after X" is captured. + +## 5. Phased implementation plan + +Each phase is independently demoable. + +- **Phase 0 — Resource pinning & baseline.** Add `deploy.resources.limits` to api/ + worker; bring up the observability overlay; capture a one-shot baseline with the + existing `demo-e2e.sh`. *Demo:* Grafana shows the run; numbers are reproducible. +- **Phase 1 — Trace propagation.** Inject `traceparent` on enqueue (Go) + outbox + row; extract + continue in the worker; add the worker tracer. *Demo:* a single + Tempo trace spans API→worker for one asset, with visible queue wait. +- **Phase 2 — Pipeline spans + log correlation.** Span dispatch/download/dedup/ + each variant/each ffmpeg call; add trace IDs to logs. *Demo:* trace waterfall + shows per-stage timing; click a log line → its trace. +- **Phase 3 — SLO recording rules + dashboards.** Prometheus recording rules for + the SLIs in §4.2; provision the four dashboards. *Demo:* a dashboard shows each + SLI vs its SLO target. +- **Phase 4 — k6 load harness.** `loadtest/` with closed- and open-model scripts, + a host-run wrapper, fixture fan-out, and k6→Prometheus remote write. *Demo:* + `./loadtest/run.sh open --rate 5/s --duration 3m` drives the system; Grafana + shows queue lag climbing and the latency knee. +- **Phase 5 — First experiment writeup.** Run a saturating profile, capture the + bottleneck (expected: the single-threaded worker), and write it up as the + motivating evidence for **Track 1**. *Demo:* `experiments/0001-worker-saturation.md` + with before numbers + the trace proving where time goes. + +## 6. How we'll know it works (acceptance) + +- A Tempo trace for any asset includes both API and worker spans, with queue wait + time visible. +- Every SLI in §4.2 renders on a dashboard against its target. +- `loadtest/run.sh` reproducibly drives the system into SLO breach, and the + responsible stage is identifiable from a trace within ~2 minutes of looking. +- Phase 5 writeup names the bottleneck with evidence — the input to Track 1. + +## 7. Risks & honest caveats + +- **Local ≠ production.** Absolute numbers are not portable (laptop CPU, no network + latency, single-node Redis/PG). Treat results as **relative**: bottleneck + location and before/after deltas are trustworthy; "we do N uploads/sec" is not. +- **Noisy neighbor.** k6, the app, and the observability stack share the laptop. + Pin app resources and keep k6 modest; consider running k6 with `--throw` budgets. +- **Container CPU limits change behavior** (e.g. ffmpeg threads). That's fine — it's + what makes runs comparable — but document the limits with each experiment. +- **Trace cardinality / sampling.** Asset-ID attributes are high-cardinality on + *traces* (OK) but must never become metric labels. Keep `TRACE_SAMPLING_RATE` + in mind; sample at 100% locally, lower in prod. +- **Dedup hides work.** Identical fixtures dedup after first processing; the load + harness must fan out unique bytes when measuring real per-job cost. + +## 8. Follow-ups (out of scope here) + +- SLO **burn-rate alerting** (multi-window) once SLOs stabilize. +- Continuous profiling (Pyroscope) to attribute CPU *inside* a span. +- CI smoke load test with a latency budget (feeds Track 8). diff --git a/experiments/0001-worker-saturation.md b/experiments/0001-worker-saturation.md new file mode 100644 index 0000000..8214ee6 --- /dev/null +++ b/experiments/0001-worker-saturation.md @@ -0,0 +1,117 @@ +# Experiment 0001 — Worker saturation under load + +**Date:** 2026-06-30 · **Track:** 3 (observability & load) · **Feeds:** Track 1 (concurrent worker) +**Status:** complete + +## Hypothesis + +The Python worker is single-threaded (`consume()` reads one message, `count=1`, +and processes it inline; `MAX_CONCURRENT_JOBS` exists in config but is unused). +Under sustained upload load the worker — not the API — should be the bottleneck, +with the Redis stream growing without bound once arrival rate exceeds the +worker's service rate (Little's Law, `L = λW`). + +## Setup (record this with every run) + +- **Resource pinning** (`docker-compose.loadtest.yml`): `api` = 1.0 CPU / 512 MB, + `worker` = **1.0 CPU** / 1 GB. The single-CPU pin makes the bottleneck a stable, + observable fact rather than something that moves with spare laptop cores. +- **Sampling:** `TRACE_SAMPLING_RATE=1.0` (every asset traced). +- **Stack:** core + observability + loadtest overlays, all up. +- **Workload:** images only, unique bytes per iteration (dedup defeated — see + `loadtest/lib.js`). Fixture `worker/tests/test_assets/image.jpg`, 3 webp + variants per asset. +- **Profile:** open model, `./loadtest/run.sh open --rate 10/s --duration 60s` + (fixed arrival rate; λ = 10 uploads/s). + +> Local results are **relative**. Trust the bottleneck location and the +> before/after deltas, not the absolute throughput — laptop CPU, no network +> latency, single-node Redis/Postgres. + +## Method (the loop) + +1. Is an SLO breached? 2. USE — is the worker CPU- or queue-saturated? +3. Open an exemplar trace — which span dominates? 4. Form a hypothesis, change +one thing, re-run the **same** profile, compare. + +## Results (before — no optimisation yet) + +| Signal | Value | Source | +|--------|-------|--------| +| Arrival rate (λ) | 10.0 uploads/s | k6 `mpiper_assets_submitted` | +| Worker service rate (μ) | **1.13 jobs/s** | `rate(mpiper_mpiper_job_processing_success_total[2m])` | +| Mean asset processing time | 0.81 s | `…_duration_seconds_sum / …_count` | +| Queue depth before → after | 3985 → 4370 (↑) | `sli:queue_depth:current` | +| Worker CPU | **98.5 %** (pinned at 1 CPU) | `docker stats` | +| API CPU | 0.4 % | `docker stats` | +| Presign p95 (API) | 48 ms (SLO < 150 ms ✅) | `sli:presign_latency_seconds:p95` | +| Job success rate | 1.0 (SLO > 99 % ✅) | `sli:job_success_ratio:ratio_rate5m` | + +**Reading:** λ (10/s) ≫ μ (1.13/s). The queue grows monotonically; the system is +**unstable for any arrival rate above ≈ 1.1 uploads/s**. The API is essentially +idle (0.4 % CPU, presign well inside SLO) while the worker is pinned at 98.5 %. +The bottleneck is unambiguously the worker, and specifically its +**single-threaded, one-job-at-a-time** processing loop — not CPU work that is +inherently slow (a single image is ~0.8 s), but the complete absence of +concurrency. + +## Trace evidence (where the time goes) + +With the trace gap now closed (Track 3, Phase 1), one asset is a single trace +from the API through the queue into the worker — example, 19 spans: + +``` +/api/v1/assets/{id}/complete (API HTTP request) +└ AssetHandler.MarkAssetUploaded + └ AssetService.MarkAssetUploaded + ├ StorageClient.GetObjectAttrs → S3.GetObjectAttrs + └ Database.Transaction + ├ AssetRepo.MarkAssetUploadedTx + ├ AssetRepo.InsertProcessAssetJobTx + └ OutboxRepo.InsertTx + └ outbox.publish (relay re-activates stored context) + └ RedisQueue.Enqueue (injects traceparent into the message) + ├ RedisQueue.doXAddWithRetry + └ worker.consume (── crosses the Redis boundary ──) + └ process.dispatch + ├ process.download + ├ process.dedup_check + └ image.variant × 3 +``` + +The **gap between `RedisQueue.Enqueue` and `worker.consume`** is the queue wait — +the time an asset spends backed up behind the single worker. Under this profile +that gap dominates end-to-end latency, and it grows for every asset because the +backlog only ever increases. The in-worker stages (download, dedup, 3 variants) +are individually fast; the cost is waiting for a free worker, not the work itself. + +## Conclusion + +The single-threaded worker is the bottleneck, with a service rate of ~1.1 +jobs/s. The pipeline cannot keep up with anything beyond a trickle of uploads, +and the deficit manifests as an unbounded Redis backlog and ever-growing +queue-wait latency — while the API and host CPU sit idle. This is the motivating +evidence for **Track 1 (concurrent worker + stream recovery + DLQ)**: honour +`MAX_CONCURRENT_JOBS` as a real concurrency limit (process pool for the +CPU-bound Pillow/ffmpeg work) so μ scales with available cores instead of being +fixed at one. + +## Reproduce + +```bash +docker compose -f docker-compose.yml -f docker-compose.observability.yml \ + -f docker-compose.loadtest.yml up -d --build +./loadtest/run.sh open --rate 10/s --duration 60s +# Grafana http://localhost:3000 → MPiper folder: +# - "Worker / App Saturation (USE)": queue depth climbing, in-flight pinned +# - "Pipeline Funnel": ready/s flat at ~1.1 while uploaded/s tracks arrival +# Tempo (Explore): TraceQL `{ name="worker.consume" }` → open one → see the +# enqueue→consume queue-wait gap and the per-stage breakdown. +``` + +## Next experiment + +After Track 1 lands a bounded worker pool, re-run this **exact** profile and +compare: μ should rise roughly with the pool size (until CPU-bound), queue depth +should stabilise instead of growing, and the enqueue→consume gap should shrink. +Record results as `0002-concurrent-worker.md`. diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 054436e..7aaf5f6 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -2,6 +2,7 @@ package metrics import ( "context" + "database/sql" "runtime" "time" @@ -40,13 +41,16 @@ type Metrics struct { StorageOperationTotal metric.Int64Counter StorageOperationErrors metric.Int64Counter - DBQueryDuration metric.Float64Histogram - DBQueryTotal metric.Int64Counter - DBQueryErrors metric.Int64Counter - DBConnectionsActive metric.Int64UpDownCounter - DBConnectionsIdle metric.Int64UpDownCounter - DBTransactionTotal metric.Int64Counter - DBTransactionErrors metric.Int64Counter + DBQueryDuration metric.Float64Histogram + DBQueryTotal metric.Int64Counter + DBQueryErrors metric.Int64Counter + DBConnectionsActive metric.Int64ObservableGauge + DBConnectionsIdle metric.Int64ObservableGauge + DBConnectionsOpen metric.Int64ObservableGauge + DBConnectionsMaxOpen metric.Int64ObservableGauge + DBConnectionsWaitCount metric.Int64ObservableGauge + DBTransactionTotal metric.Int64Counter + DBTransactionErrors metric.Int64Counter QueueMessagePublished metric.Int64Counter QueueMessageConsumed metric.Int64Counter @@ -83,6 +87,22 @@ func (m *Metrics) RegisterQueueDepthFunc(fn func(context.Context) (int64, error) return err } +// RegisterDBStatsFunc wires sql.DBStats (connection-pool stats) to the DB +// connection gauges. fn typically returns db.Stats(). One callback observes all +// pool gauges from a single stats snapshot so they stay mutually consistent. +func (m *Metrics) RegisterDBStatsFunc(fn func() sql.DBStats) error { + _, err := m.meter.RegisterCallback(func(_ context.Context, o metric.Observer) error { + s := fn() + o.ObserveInt64(m.DBConnectionsActive, int64(s.InUse)) + o.ObserveInt64(m.DBConnectionsIdle, int64(s.Idle)) + o.ObserveInt64(m.DBConnectionsOpen, int64(s.OpenConnections)) + o.ObserveInt64(m.DBConnectionsMaxOpen, int64(s.MaxOpenConnections)) + o.ObserveInt64(m.DBConnectionsWaitCount, s.WaitCount) + return nil + }, m.DBConnectionsActive, m.DBConnectionsIdle, m.DBConnectionsOpen, m.DBConnectionsMaxOpen, m.DBConnectionsWaitCount) + return err +} + // RegisterOutboxPendingFunc wires a callback to the OutboxPendingGauge. func (m *Metrics) RegisterOutboxPendingFunc(fn func(context.Context) (int64, error)) error { _, err := m.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { @@ -146,7 +166,9 @@ func InitMetrics(ctx context.Context, logger *zap.Logger) (*Metrics, func(contex mp := sdkmetric.NewMeterProvider( sdkmetric.WithResource(res), - sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exp)), + // 15s export interval (matches the worker) so RED/SLO dashboards stay + // responsive under load; the SDK default of 60s lags the 5m rate window. + sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(15*time.Second))), sdkmetric.WithView( sdkmetric.NewView( sdkmetric.Instrument{Name: "http.server.request.duration", Kind: sdkmetric.InstrumentKindHistogram}, @@ -157,6 +179,18 @@ func InitMetrics(ctx context.Context, logger *zap.Logger) (*Metrics, func(contex }, ), ), + sdkmetric.WithView( + // Sub-second-resolution buckets for the queue lag histogram; the + // default buckets are too coarse and inflate the queue-wait SLI. + sdkmetric.NewView( + sdkmetric.Instrument{Name: "queue.processing.lag", Kind: sdkmetric.InstrumentKindHistogram}, + sdkmetric.Stream{ + Aggregation: sdkmetric.AggregationExplicitBucketHistogram{ + Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5}, + }, + }, + ), + ), ) otel.SetMeterProvider(mp) @@ -280,16 +314,31 @@ func initDatabaseMetrics(m *Metrics, meter metric.Meter, logger *zap.Logger) { if err != nil { logger.Sugar().Fatalf("Failed to create DB query errors: %v", err) } - m.DBConnectionsActive, err = meter.Int64UpDownCounter("db.connections.active", - metric.WithDescription("Number of active database connections"), metric.WithUnit("{connection}")) + m.DBConnectionsActive, err = meter.Int64ObservableGauge("db.connections.active", + metric.WithDescription("Number of in-use database connections"), metric.WithUnit("{connection}")) if err != nil { logger.Sugar().Fatalf("Failed to create DB active connections: %v", err) } - m.DBConnectionsIdle, err = meter.Int64UpDownCounter("db.connections.idle", + m.DBConnectionsIdle, err = meter.Int64ObservableGauge("db.connections.idle", metric.WithDescription("Number of idle database connections"), metric.WithUnit("{connection}")) if err != nil { logger.Sugar().Fatalf("Failed to create DB idle connections: %v", err) } + m.DBConnectionsOpen, err = meter.Int64ObservableGauge("db.connections.open", + metric.WithDescription("Number of open database connections (in-use + idle)"), metric.WithUnit("{connection}")) + if err != nil { + logger.Sugar().Fatalf("Failed to create DB open connections: %v", err) + } + m.DBConnectionsMaxOpen, err = meter.Int64ObservableGauge("db.connections.max_open", + metric.WithDescription("Configured max open database connections (0 = unlimited)"), metric.WithUnit("{connection}")) + if err != nil { + logger.Sugar().Fatalf("Failed to create DB max open connections: %v", err) + } + m.DBConnectionsWaitCount, err = meter.Int64ObservableGauge("db.connections.wait_count", + metric.WithDescription("Cumulative count of connection waits (pool contention)"), metric.WithUnit("{wait}")) + if err != nil { + logger.Sugar().Fatalf("Failed to create DB wait count: %v", err) + } m.DBTransactionTotal, err = meter.Int64Counter("db.transaction.total", metric.WithDescription("Total number of database transactions"), metric.WithUnit("{transaction}")) if err != nil { diff --git a/internal/middleware/logging.go b/internal/middleware/logging.go index c046ab2..7c453be 100644 --- a/internal/middleware/logging.go +++ b/internal/middleware/logging.go @@ -37,6 +37,11 @@ func LoggerMiddleware(l *zap.Logger) func(next http.Handler) http.Handler { zap.String("proto", r.Proto), ) + // Stamp trace_id/span_id from the active span (TracingMiddleware runs + // before this) so request logs cross-link to their Tempo trace and + // any handler/service using the context logger inherits the IDs. + reqLogger = applogger.WithTrace(r.Context(), reqLogger) + ctx := applogger.WithLogger(r.Context(), reqLogger) r = r.WithContext(ctx) diff --git a/internal/middleware/metrics.go b/internal/middleware/metrics.go index 5e02ae4..542c9be 100644 --- a/internal/middleware/metrics.go +++ b/internal/middleware/metrics.go @@ -32,43 +32,51 @@ func MetricsMiddleware(m *metrics.Metrics) func(http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { start := time.Now() - route := chi.RouteContext(r.Context()).RoutePattern() - if route == "" { - route = "unknown" - } - wrapped := &metricsResponseWriter{ResponseWriter: w, statusCode: http.StatusOK} - attrs := []attribute.KeyValue{ - attribute.String("http.method", r.Method), - attribute.String("http.route", route), - } + // In-flight gauge: keyed by method only. The route pattern is not yet + // known here (chi populates it during routing, after this middleware), + // so it would be "unknown"; using it on +1/-1 still nets to zero but + // adds no value. Per-route labels are applied post-routing below. + inflightAttrs := []attribute.KeyValue{attribute.String("http.method", r.Method)} if m != nil { - m.HTTPActiveRequests.Add(r.Context(), 1, metric.WithAttributes(attrs...)) - defer m.HTTPActiveRequests.Add(r.Context(), -1, metric.WithAttributes(attrs...)) + m.HTTPActiveRequests.Add(r.Context(), 1, metric.WithAttributes(inflightAttrs...)) + defer m.HTTPActiveRequests.Add(r.Context(), -1, metric.WithAttributes(inflightAttrs...)) } defer func() { if rec := recover(); rec != nil { wrapped.statusCode = http.StatusInternalServerError - recordHTTPMetrics(m, r, wrapped, start, attrs) + recordHTTPMetrics(m, r, wrapped, start) panic(rec) } }() next.ServeHTTP(wrapped, r) - recordHTTPMetrics(m, r, wrapped, start, attrs) + recordHTTPMetrics(m, r, wrapped, start) }) } } -func recordHTTPMetrics(m *metrics.Metrics, r *http.Request, w *metricsResponseWriter, start time.Time, baseAttrs []attribute.KeyValue) { +func recordHTTPMetrics(m *metrics.Metrics, r *http.Request, w *metricsResponseWriter, start time.Time) { if m == nil { return } + // chi populates the matched route pattern during routing, so it is only + // available now (after ServeHTTP). Reading it earlier yields "" — the source + // of the previous "unknown" http_route label that broke per-route SLOs. + route := chi.RouteContext(r.Context()).RoutePattern() + if route == "" { + route = "unknown" + } + duration := time.Since(start).Seconds() - attrs := append(baseAttrs, attribute.Int("http.status_code", w.statusCode)) + attrs := []attribute.KeyValue{ + attribute.String("http.method", r.Method), + attribute.String("http.route", route), + attribute.Int("http.status_code", w.statusCode), + } m.HTTPRequestDuration.Record(r.Context(), duration, metric.WithAttributes(attrs...)) m.HTTPRequestCount.Add(r.Context(), 1, metric.WithAttributes(attrs...)) diff --git a/internal/models/outbox.go b/internal/models/outbox.go index 297883c..58647d8 100644 --- a/internal/models/outbox.go +++ b/internal/models/outbox.go @@ -13,6 +13,12 @@ type OutboxEvent struct { JobID *int64 `db:"job_id"` Event string `db:"event"` Payload json.RawMessage `db:"payload"` + // Traceparent carries the W3C trace context captured when the row was + // written, so the distributed trace survives the outbox store-and-forward + // hop. The relay re-activates it before publishing to Redis. Nullable: + // rows written before this column existed (or without an active span) have + // no trace context. + Traceparent *string `db:"traceparent"` Status string `db:"status"` Attempts int `db:"attempts"` MaxAttempts int `db:"max_attempts"` diff --git a/internal/outbox/relay.go b/internal/outbox/relay.go index ae485b5..6e2f8a8 100644 --- a/internal/outbox/relay.go +++ b/internal/outbox/relay.go @@ -8,6 +8,11 @@ import ( "github.com/rndmcodeguy20/mpiper/internal/metrics" "github.com/rndmcodeguy20/mpiper/internal/queue" "github.com/rndmcodeguy20/mpiper/internal/repository" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" ) @@ -17,12 +22,13 @@ type Relay struct { queue queue.Queue logger *zap.Logger m *metrics.Metrics + tracer trace.Tracer interval time.Duration batch int } func NewRelay(repo repository.OutboxRepository, q queue.Queue, logger *zap.Logger, m *metrics.Metrics, interval time.Duration, batch int) *Relay { - return &Relay{repo: repo, queue: q, logger: logger, m: m, interval: interval, batch: batch} + return &Relay{repo: repo, queue: q, logger: logger, m: m, tracer: otel.Tracer("mpiper-api"), interval: interval, batch: batch} } // Start runs the relay loop until ctx is cancelled. It finishes the in-flight batch before returning. @@ -71,8 +77,26 @@ func (r *Relay) tick(ctx context.Context) { continue } - if _, err := r.queue.Enqueue(ctx, payload); err != nil { + // Re-activate the producer's trace context (captured when the row was + // written) so the publish + enqueue spans rejoin the original request + // trace instead of starting a disconnected root. tick() runs on a + // background ticker context, so without this the trace would break here. + publishCtx := ctx + if row.Traceparent != nil && *row.Traceparent != "" { + carrier := propagation.MapCarrier{"traceparent": *row.Traceparent} + publishCtx = otel.GetTextMapPropagator().Extract(ctx, carrier) + } + publishCtx, span := r.tracer.Start(publishCtx, "outbox.publish") + span.SetAttributes( + attribute.Int64("outbox.row_id", row.ID), + attribute.String("event", row.Event), + ) + + if _, err := r.queue.Enqueue(publishCtx, payload); err != nil { r.logger.Warn("outbox relay: enqueue failed", zap.Int64("id", row.ID), zap.Error(err)) + span.RecordError(err) + span.SetStatus(codes.Error, "enqueue failed") + span.End() _ = r.repo.IncrementAttempts(ctx, row.ID, err.Error()) if row.Attempts+1 >= row.MaxAttempts { _ = r.repo.MarkFailed(ctx, row.ID, err.Error()) @@ -82,6 +106,7 @@ func (r *Relay) tick(ctx context.Context) { } continue } + span.End() publishedIDs = append(publishedIDs, row.ID) } diff --git a/internal/outbox/relay_trace_test.go b/internal/outbox/relay_trace_test.go new file mode 100644 index 0000000..9fc7f54 --- /dev/null +++ b/internal/outbox/relay_trace_test.go @@ -0,0 +1,124 @@ +package outbox + +import ( + "context" + "database/sql" + "encoding/json" + "testing" + "time" + + "github.com/google/uuid" + "github.com/rndmcodeguy20/mpiper/internal/models" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" + "go.uber.org/zap" +) + +// fakeOutboxRepo is an in-memory OutboxRepository for white-box relay tests. +type fakeOutboxRepo struct { + pending []models.OutboxEvent + published []int64 + incremented []int64 + failed []int64 +} + +func (f *fakeOutboxRepo) InsertTx(_ context.Context, _ *sql.Tx, _ models.OutboxEvent) error { + return nil +} +func (f *fakeOutboxRepo) FetchPendingBatch(_ context.Context, _ int) ([]models.OutboxEvent, error) { + out := f.pending + f.pending = nil // single tick + return out, nil +} +func (f *fakeOutboxRepo) MarkPublished(_ context.Context, ids []int64) error { + f.published = append(f.published, ids...) + return nil +} +func (f *fakeOutboxRepo) IncrementAttempts(_ context.Context, id int64, _ string) error { + f.incremented = append(f.incremented, id) + return nil +} +func (f *fakeOutboxRepo) MarkFailed(_ context.Context, id int64, _ string) error { + f.failed = append(f.failed, id) + return nil +} +func (f *fakeOutboxRepo) DeletePublishedBefore(_ context.Context, _ time.Time) (int64, error) { + return 0, nil +} +func (f *fakeOutboxRepo) CountPending(_ context.Context) (int64, error) { return 0, nil } + +// capturingQueue records the context handed to Enqueue. +type capturingQueue struct { + gotCtx context.Context + gotPayload map[string]interface{} +} + +func (q *capturingQueue) Enqueue(ctx context.Context, payload map[string]interface{}) (string, error) { + q.gotCtx = ctx + q.gotPayload = payload + return "1-0", nil +} + +func TestRelay_ReactivatesStoredTraceContext(t *testing.T) { + otel.SetTextMapPropagator(propagation.TraceContext{}) + + // Build a known producer span context and serialize it as a traceparent. + traceID, _ := trace.TraceIDFromHex("0af7651916cd43dd8448eb211c80319c") + spanID, _ := trace.SpanIDFromHex("b7ad6b7169203331") + sc := trace.NewSpanContext(trace.SpanContextConfig{ + TraceID: traceID, + SpanID: spanID, + TraceFlags: trace.FlagsSampled, + }) + carrier := propagation.MapCarrier{} + otel.GetTextMapPropagator().Inject( + trace.ContextWithSpanContext(context.Background(), sc), carrier) + tp := carrier.Get("traceparent") + if tp == "" { + t.Fatal("failed to build traceparent") + } + + payload, _ := json.Marshal(map[string]interface{}{"asset_id": uuid.New().String()}) + repo := &fakeOutboxRepo{pending: []models.OutboxEvent{ + {ID: 7, Event: "asset_uploaded", Payload: payload, Traceparent: &tp, MaxAttempts: 5}, + }} + q := &capturingQueue{} + + relay := NewRelay(repo, q, zap.NewNop(), nil, time.Second, 100) + relay.tick(context.Background()) + + if q.gotCtx == nil { + t.Fatal("Enqueue was not called") + } + gotSC := trace.SpanContextFromContext(q.gotCtx) + if !gotSC.IsValid() { + t.Fatal("expected a valid span context passed to Enqueue") + } + if gotSC.TraceID() != traceID { + t.Fatalf("trace id not propagated: want %s got %s", traceID, gotSC.TraceID()) + } + if len(repo.published) != 1 || repo.published[0] != 7 { + t.Fatalf("expected row 7 marked published, got %v", repo.published) + } +} + +func TestRelay_NoTraceparentStillPublishes(t *testing.T) { + otel.SetTextMapPropagator(propagation.TraceContext{}) + + payload, _ := json.Marshal(map[string]interface{}{"asset_id": uuid.New().String()}) + repo := &fakeOutboxRepo{pending: []models.OutboxEvent{ + {ID: 9, Event: "asset_uploaded", Payload: payload, MaxAttempts: 5}, + }} + q := &capturingQueue{} + + relay := NewRelay(repo, q, zap.NewNop(), nil, time.Second, 100) + relay.tick(context.Background()) + + if q.gotCtx == nil { + t.Fatal("Enqueue was not called") + } + if len(repo.published) != 1 || repo.published[0] != 9 { + t.Fatalf("expected row 9 marked published, got %v", repo.published) + } +} diff --git a/internal/queue/queue.go b/internal/queue/queue.go index 3197f11..2f4ccfc 100644 --- a/internal/queue/queue.go +++ b/internal/queue/queue.go @@ -15,6 +15,7 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" "go.opentelemetry.io/otel/trace" ) @@ -126,6 +127,18 @@ func (rq *RedisQueue) Enqueue(ctx context.Context, payload map[string]interface{ "body": string(body), } + // Inject the active trace context as top-level message fields so the worker + // can extract it and continue the trace across the queue boundary. The + // propagator writes traceparent (and tracestate when present); we copy them + // out as separate stream fields alongside body, preserving existing keys. + carrier := propagation.MapCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + for _, k := range []string{"traceparent", "tracestate", "baggage"} { + if v := carrier.Get(k); v != "" { + streamEntry[k] = v + } + } + args := &redis.XAddArgs{ Stream: rq.options.QueueName, Values: streamEntry, diff --git a/internal/repository/outbox_repo.go b/internal/repository/outbox_repo.go index 2ed7d3b..2676bb2 100644 --- a/internal/repository/outbox_repo.go +++ b/internal/repository/outbox_repo.go @@ -32,8 +32,8 @@ func NewOutboxRepository(db *sqlx.DB, logger *zap.Logger) OutboxRepository { func (r *outboxRepo) InsertTx(ctx context.Context, tx *sql.Tx, event models.OutboxEvent) error { _, err := tx.ExecContext(ctx, - `INSERT INTO event_outbox (aggregate_id, job_id, event, payload, max_attempts) VALUES ($1, $2, $3, $4, $5)`, - event.AggregateID, event.JobID, event.Event, event.Payload, event.MaxAttempts, + `INSERT INTO event_outbox (aggregate_id, job_id, event, payload, traceparent, max_attempts) VALUES ($1, $2, $3, $4, $5, $6)`, + event.AggregateID, event.JobID, event.Event, event.Payload, event.Traceparent, event.MaxAttempts, ) return err } @@ -41,7 +41,7 @@ func (r *outboxRepo) InsertTx(ctx context.Context, tx *sql.Tx, event models.Outb func (r *outboxRepo) FetchPendingBatch(ctx context.Context, limit int) ([]models.OutboxEvent, error) { var rows []models.OutboxEvent err := r.db.SelectContext(ctx, &rows, - `SELECT id, aggregate_id, job_id, event, payload, status, attempts, max_attempts, last_error, created_at, published_at + `SELECT id, aggregate_id, job_id, event, payload, traceparent, status, attempts, max_attempts, last_error, created_at, published_at FROM event_outbox WHERE status = 'pending' ORDER BY id LIMIT $1 FOR UPDATE SKIP LOCKED`, limit) return rows, err } diff --git a/internal/router/router.go b/internal/router/router.go index 7d44fea..1461967 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -105,9 +105,9 @@ func NewRouter(cfg config.EnvConfig, db *sqlx.DB, m *metrics.Metrics) *chi.Mux { AllowCredentials: true, MaxAge: 300, })) + r.Use(appMiddleware.TracingMiddleware) r.Use(appMiddleware.LoggerMiddleware(logger)) r.Use(middleware.Timeout(MiddlewareTimeout)) - r.Use(appMiddleware.TracingMiddleware) r.Use(appMiddleware.MetricsMiddleware(m)) r.Use(middleware.Compress(5)) r.Use(appMiddleware.SlowRequestMiddleware(logger, 2*time.Second)) diff --git a/internal/service/asset.go b/internal/service/asset.go index 86bb684..59fc4d5 100644 --- a/internal/service/asset.go +++ b/internal/service/asset.go @@ -19,6 +19,7 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" "go.uber.org/zap" ) @@ -258,11 +259,23 @@ func (s *assetService) MarkAssetUploaded(ctx context.Context, assetID uuid.UUID) "event": "asset_uploaded", "timestamp": time.Now().UTC().Format(time.RFC3339), }) + + // Capture the active trace context so it survives the outbox store-and-forward + // hop. The relay (running on a background ticker context) re-activates this + // before publishing to Redis, keeping the whole pipeline in one trace. + carrier := propagation.MapCarrier{} + otel.GetTextMapPropagator().Inject(ctxOutbox, carrier) + var traceparent *string + if tp := carrier.Get("traceparent"); tp != "" { + traceparent = &tp + } + err = s.outboxRepo.InsertTx(ctxOutbox, tx, models.OutboxEvent{ AggregateID: assetID, JobID: jobID, Event: "asset_uploaded", Payload: payload, + Traceparent: traceparent, }) spanOutbox.End() diff --git a/loadtest/README.md b/loadtest/README.md new file mode 100644 index 0000000..c189af4 --- /dev/null +++ b/loadtest/README.md @@ -0,0 +1,69 @@ +# MPiper Load Harness (k6) — Track 3, Phase 4 + +Drives the **real** client flow from the host (presign → PUT to MinIO → +complete), so the whole pipeline — API, outbox relay, Redis, worker, ffmpeg — is +exercised end-to-end and observable as one trace per asset. + +## Install + +```bash +brew install k6 # macOS +# or see https://grafana.com/docs/k6/latest/set-up/install-k6/ +``` + +`run.sh` also needs `python3` with the `cryptography` package on the host (used +only to mint the AES-GCM auth token). + +## Prerequisites + +Bring the stack up **with the observability overlay** (so Prometheus accepts +k6's remote-write) and ideally the **loadtest overlay** (CPU-pinned, full +sampling) so runs are reproducible: + +```bash +docker compose \ + -f docker-compose.yml \ + -f docker-compose.observability.yml \ + -f docker-compose.loadtest.yml \ + up -d --build +``` + +## Run + +```bash +# CLOSED model — fixed VUs hammer the system (find max throughput / saturation) +./loadtest/run.sh closed --vus 10 --duration 2m +./loadtest/run.sh closed --vus 20 --duration 3m --ramp + +# OPEN model — fixed arrival rate (find the latency knee; watch queue lag grow) +./loadtest/run.sh open --rate 5/s --duration 3m +./loadtest/run.sh open --rate 10/s --duration 3m --max-vus 400 +``` + +Options: `--fixture PATH`, `--base-url URL`, `--no-prometheus`. + +## What to watch + +- **k6 terminal summary** — client-side request rate, error rate, and the custom + trends (`mpiper_presign_latency_ms`, `mpiper_upload_latency_ms`, + `mpiper_complete_latency_ms`). Thresholds map to the §4.2 SLOs and fail the run + on breach (exit non-zero). +- **Grafana** (http://localhost:3000) — the Track 3 dashboards: API RED, the + app-saturation/USE view (queue depth, in-flight, backlogs), the pipeline + funnel, and queue health. In the open model, queue depth climbing while the + API stays healthy is the worker bottleneck made visible. +- **Tempo** — click a latency exemplar on a histogram panel to jump straight to + the trace for that asset and see which span dominates. + +## Dedup fan-out + +The worker dedups by content hash, so identical bytes do almost no work after +the first asset. The harness appends per-iteration unique bytes **after** the +JPEG end-of-image marker (decoders ignore trailing bytes), yielding a valid but +unique-hash image so every iteration costs real work. See `lib.js`. + +## Caveat + +Local results are **relative**: trust the bottleneck *location* and +before/after deltas, not absolute throughput. Always record the resource limits +(from `docker-compose.loadtest.yml`) with each experiment. diff --git a/loadtest/closed_model.js b/loadtest/closed_model.js new file mode 100644 index 0000000..f66b37b --- /dev/null +++ b/loadtest/closed_model.js @@ -0,0 +1,42 @@ +// loadtest/closed_model.js +// +// CLOSED model: a fixed pool of VUs, each looping the upload flow as fast as +// the system allows. Good for finding max throughput and the saturation point +// of the (single-threaded) worker. +// +// VUS=10 DURATION=2m k6 run loadtest/closed_model.js +// STAGES=1 -> ramping profile (override via env, see below) +// +// Prefer the wrapper: ./loadtest/run.sh closed --vus 10 --duration 2m + +import { runUploadFlow, sloThresholds } from "./lib.js"; + +const VUS = parseInt(__ENV.VUS || "10", 10); +const DURATION = __ENV.DURATION || "2m"; + +export const options = { + scenarios: { + closed: __ENV.RAMP === "1" + ? { + executor: "ramping-vus", + startVUs: 1, + stages: [ + { duration: "30s", target: VUS }, + { duration: DURATION, target: VUS }, + { duration: "30s", target: 0 }, + ], + gracefulStop: "30s", + } + : { + executor: "constant-vus", + vus: VUS, + duration: DURATION, + gracefulStop: "30s", + }, + }, + thresholds: sloThresholds, +}; + +export default function () { + runUploadFlow(); +} diff --git a/loadtest/lib.js b/loadtest/lib.js new file mode 100644 index 0000000..5007d63 --- /dev/null +++ b/loadtest/lib.js @@ -0,0 +1,123 @@ +// loadtest/lib.js +// +// Shared helpers for the MPiper k6 load harness. Each iteration performs the +// REAL client flow from the host, exactly like scripts/demo-e2e.sh: +// +// 1. POST /api/v1/storage/presign -> uploadUrl + assetId +// 2. PUT (bytes straight to MinIO at the public endpoint) +// 3. GET /api/v1/assets/{assetId}/complete -> enqueues processing +// +// Dedup defeat: the worker dedups by content hash, so identical bytes do ~no +// work after the first asset. We append per-iteration unique bytes AFTER the +// JPEG end-of-image marker (decoders ignore trailing bytes), giving a valid but +// unique-hash image so we measure real per-job cost. See track-03 §7. + +import http from "k6/http"; +import { check } from "k6"; +import { Trend, Rate, Counter } from "k6/metrics"; + +// --- Config (host-run; see run.sh) --------------------------------------- +export const BASE_URL = __ENV.BASE_URL || "http://localhost:5010"; +const TOKEN = __ENV.K6_TOKEN || ""; // minted by run.sh (AES-GCM auth token) + +// --- Custom metrics mapped to the SLOs (§4.2) ---------------------------- +export const presignLatency = new Trend("mpiper_presign_latency_ms", true); +export const uploadLatency = new Trend("mpiper_upload_latency_ms", true); +export const completeLatency = new Trend("mpiper_complete_latency_ms", true); +export const flowErrors = new Rate("mpiper_flow_errors"); +export const assetsSubmitted = new Counter("mpiper_assets_submitted"); + +// --- Fixture (loaded once at init) --------------------------------------- +// open() resolves relative to this script. 'b' returns an ArrayBuffer. +const FIXTURE_PATH = + __ENV.FIXTURE_PATH || "../worker/tests/test_assets/image.jpg"; +const baseFixture = new Uint8Array(open(FIXTURE_PATH, "b")); + +function authHeaders(extra) { + return Object.assign({ Authorization: `Bearer ${TOKEN}` }, extra || {}); +} + +// Build a unique-hash image: base JPEG + a unique trailer (VU/iter/random). +function uniqueImageBytes() { + const tag = `\nMPIPER-LOADTEST-${__VU}-${__ITER}-${Math.random()}`; + // k6's JS runtime has no TextEncoder; the tag is ASCII so charCodeAt suffices. + const suffix = new Uint8Array(tag.length); + for (let i = 0; i < tag.length; i++) suffix[i] = tag.charCodeAt(i) & 0xff; + const out = new Uint8Array(baseFixture.length + suffix.length); + out.set(baseFixture, 0); + out.set(suffix, baseFixture.length); + return out.buffer; +} + +// Run one full presign -> upload -> complete flow. Returns true on success. +export function runUploadFlow() { + const bytes = uniqueImageBytes(); + const contentType = "image/jpeg"; + + // 1. presign + const presignRes = http.post( + `${BASE_URL}/api/v1/storage/presign`, + JSON.stringify({ + fileName: `loadtest-${__VU}-${__ITER}.jpg`, + contentType, + size: bytes.byteLength, + }), + { headers: authHeaders({ "Content-Type": "application/json" }), tags: { step: "presign" } } + ); + presignLatency.add(presignRes.timings.duration); + const presignOk = check(presignRes, { + "presign 2xx": (r) => r.status >= 200 && r.status < 300, + }); + if (!presignOk) { + flowErrors.add(1); + return false; + } + + const data = presignRes.json("data"); + if (!data || !data.uploadUrl || !data.assetId) { + flowErrors.add(1); + return false; + } + + // 2. upload bytes straight to object storage (public endpoint) + const uploadRes = http.put(data.uploadUrl, bytes, { + headers: { "Content-Type": contentType }, + tags: { step: "upload" }, + }); + uploadLatency.add(uploadRes.timings.duration); + const uploadOk = check(uploadRes, { + "upload 2xx": (r) => r.status >= 200 && r.status < 300, + }); + if (!uploadOk) { + flowErrors.add(1); + return false; + } + + // 3. complete -> enqueue processing + const completeRes = http.get( + `${BASE_URL}/api/v1/assets/${data.assetId}/complete`, + { headers: authHeaders(), tags: { step: "complete" } } + ); + completeLatency.add(completeRes.timings.duration); + const completeOk = check(completeRes, { + "complete 2xx": (r) => r.status >= 200 && r.status < 300, + }); + if (!completeOk) { + flowErrors.add(1); + return false; + } + + flowErrors.add(0); + assetsSubmitted.add(1); + return true; +} + +// Thresholds shared by both models, derived from the §4.2 SLOs. +export const sloThresholds = { + // Presign SLO: p95 < 150ms. + mpiper_presign_latency_ms: ["p(95)<150"], + // End-to-end client errors must stay under 1% (job success SLO > 99%). + mpiper_flow_errors: ["rate<0.01"], + // Overall check pass rate. + checks: ["rate>0.99"], +}; diff --git a/loadtest/open_model.js b/loadtest/open_model.js new file mode 100644 index 0000000..ead2bfd --- /dev/null +++ b/loadtest/open_model.js @@ -0,0 +1,35 @@ +// loadtest/open_model.js +// +// OPEN model: a fixed arrival rate of new uploads/sec, independent of how fast +// the system responds. Good for finding the latency knee and watching queue +// lag grow when arrival rate > service rate — a live demonstration of Little's +// Law (L = λW). When the worker can't keep up, the Redis stream depth climbs +// even though the API keeps accepting work. +// +// RATE=5 DURATION=3m k6 run loadtest/open_model.js +// +// Prefer the wrapper: ./loadtest/run.sh open --rate 5/s --duration 3m + +import { runUploadFlow, sloThresholds } from "./lib.js"; + +const RATE = parseInt(__ENV.RATE || "5", 10); // iterations/sec +const DURATION = __ENV.DURATION || "3m"; +const MAX_VUS = parseInt(__ENV.MAX_VUS || String(RATE * 20), 10); + +export const options = { + scenarios: { + open: { + executor: "constant-arrival-rate", + rate: RATE, + timeUnit: "1s", + duration: DURATION, + preAllocatedVUs: Math.max(10, RATE * 2), + maxVUs: MAX_VUS, + }, + }, + thresholds: sloThresholds, +}; + +export default function () { + runUploadFlow(); +} diff --git a/loadtest/run.sh b/loadtest/run.sh new file mode 100755 index 0000000..d6e58b3 --- /dev/null +++ b/loadtest/run.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# loadtest/run.sh — host-run wrapper for the MPiper k6 load harness. +# +# Usage: +# ./loadtest/run.sh closed --vus 10 --duration 2m [--ramp] +# ./loadtest/run.sh open --rate 5/s --duration 3m [--max-vus 200] +# +# Options (any model): +# --fixture PATH image fixture to fan out (default worker/tests/test_assets/image.jpg) +# --base-url URL API base (default http://localhost:5010) +# --no-prometheus do not stream k6 metrics to Prometheus remote-write +# +# Requires on the host: k6 (brew install k6), python3 with `cryptography` +# (only to mint the AES-GCM auth token), and the stack up with the +# observability overlay (so Prometheus remote-write is enabled). +set -euo pipefail + +MODEL="${1:-}" +if [[ "$MODEL" != "closed" && "$MODEL" != "open" ]]; then + echo "usage: $0 [options]" >&2 + exit 2 +fi +shift || true + +VUS=10 +DURATION="" +RATE=5 +MAX_VUS="" +RAMP=0 +FIXTURE="worker/tests/test_assets/image.jpg" +BASE_URL="http://localhost:5010" +USE_PROM=1 + +while [[ $# -gt 0 ]]; do + case "$1" in + --vus) VUS="$2"; shift 2 ;; + --duration) DURATION="$2"; shift 2 ;; + --rate) RATE="${2%/s}"; shift 2 ;; # accept "5/s" or "5" + --max-vus) MAX_VUS="$2"; shift 2 ;; + --ramp) RAMP=1; shift ;; + --fixture) FIXTURE="$2"; shift 2 ;; + --base-url) BASE_URL="$2"; shift 2 ;; + --no-prometheus) USE_PROM=0; shift ;; + *) echo "unknown option: $1" >&2; exit 2 ;; + esac +done + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +# --- Mint an auth token (matches scripts/demo-e2e.sh / README) ----------- +ENCRYPTION_KEY="${ENCRYPTION_KEY:-0123456789abcdef0123456789abcdef}" +K6_TOKEN="$(ENCRYPTION_KEY="$ENCRYPTION_KEY" python3 - <<'PY' +import base64, os +from cryptography.hazmat.primitives.ciphers.aead import AESGCM +key = os.environ["ENCRYPTION_KEY"].encode() +nonce = os.urandom(12) +ct = AESGCM(key).encrypt(nonce, b"demo-user", None) +print(base64.urlsafe_b64encode(nonce + ct).rstrip(b"=").decode()) +PY +)" +export K6_TOKEN BASE_URL +export FIXTURE_PATH="$REPO_ROOT/$FIXTURE" + +# --- Stream client metrics to the bundled Prometheus (remote-write) ------ +K6_OUT=() +if [[ "$USE_PROM" == "1" ]]; then + export K6_PROMETHEUS_RW_SERVER_URL="${K6_PROMETHEUS_RW_SERVER_URL:-http://localhost:9090/api/v1/write}" + export K6_PROMETHEUS_RW_TREND_STATS="p(95),p(99),avg,max" + K6_OUT=(-o experimental-prometheus-rw) + echo "k6 → Prometheus remote-write at $K6_PROMETHEUS_RW_SERVER_URL" +fi + +cd "$REPO_ROOT" + +if [[ "$MODEL" == "closed" ]]; then + export VUS DURATION="${DURATION:-2m}" RAMP + echo "closed model: VUS=$VUS DURATION=$DURATION RAMP=$RAMP" + exec k6 run "${K6_OUT[@]}" loadtest/closed_model.js +else + export RATE DURATION="${DURATION:-3m}" + [[ -n "$MAX_VUS" ]] && export MAX_VUS + echo "open model: RATE=${RATE}/s DURATION=$DURATION" + exec k6 run "${K6_OUT[@]}" loadtest/open_model.js +fi diff --git a/observability/grafana/dashboards/dashboards.yaml b/observability/grafana/dashboards/dashboards.yaml new file mode 100644 index 0000000..63f23a9 --- /dev/null +++ b/observability/grafana/dashboards/dashboards.yaml @@ -0,0 +1,24 @@ +# ============================================================================ +# Grafana Dashboards — provider config +# +# IMPORTANT: this file must live in the *dashboards* provisioning directory +# (/etc/grafana/provisioning/dashboards), NOT the datasources one — Grafana +# only scans this directory for dashboard providers. It was previously misplaced +# under datasources/, so no JSON dashboards were ever loaded. +# ============================================================================ + +apiVersion: 1 + +providers: + - name: 'MPiper' + orgId: 1 + folder: 'MPiper' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + # Same dir the JSON dashboards are mounted into; Grafana loads *.json as + # dashboards and ignores this provider yaml. + path: /etc/grafana/provisioning/dashboards + foldersFromFilesStructure: true diff --git a/observability/grafana/dashboards/mpiper-api-red.json b/observability/grafana/dashboards/mpiper-api-red.json new file mode 100644 index 0000000..7d90e26 --- /dev/null +++ b/observability/grafana/dashboards/mpiper-api-red.json @@ -0,0 +1,85 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 0, + "id": null, + "uid": "mpiper-api-red", + "title": "MPiper — API RED", + "tags": ["mpiper", "track-3", "red"], + "timezone": "browser", + "schemaVersion": 39, + "time": { "from": "now-30m", "to": "now" }, + "refresh": "10s", + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Request rate (req/s) by route", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "expr": "sum by (http_route) (rate(mpiper_http_server_request_count_total[5m]))", + "legendFormat": "{{http_route}}" + } + ] + }, + { + "id": 2, + "type": "timeseries", + "title": "Error ratio (5xx) by route", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1, "min": 0 }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "expr": "sli:http_error_ratio:ratio_rate5m", + "legendFormat": "{{http_route}}" + } + ] + }, + { + "id": 3, + "type": "timeseries", + "title": "Duration p95 by route (exemplars → Tempo)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "expr": "sli:http_request_latency_seconds:p95", + "legendFormat": "{{http_route}} p95", + "exemplar": true + } + ] + }, + { + "id": 4, + "type": "stat", + "title": "Presign p95 (SLO < 150ms)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.15 } + ] + } + }, + "overrides": [] + }, + "targets": [ + { "refId": "A", "expr": "sli:presign_latency_seconds:p95", "legendFormat": "presign p95" } + ] + } + ] +} diff --git a/observability/grafana/dashboards/mpiper-experiment-overview.json b/observability/grafana/dashboards/mpiper-experiment-overview.json new file mode 100644 index 0000000..49d75ac --- /dev/null +++ b/observability/grafana/dashboards/mpiper-experiment-overview.json @@ -0,0 +1,212 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 1, + "id": null, + "uid": "mpiper-experiment-overview", + "title": "MPiper — Experiment Overview (Track 3)", + "description": "One pane for load experiments: k6 client load alongside server-side API/worker/queue metrics and the SLO summary.", + "tags": ["mpiper", "track-3", "experiment", "load"], + "timezone": "browser", + "schemaVersion": 39, + "time": { "from": "now-1h", "to": "now" }, + "refresh": "10s", + "templating": { "list": [] }, + "panels": [ + { "id": 100, "type": "row", "title": "Load — k6 client (host)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } }, + { + "id": 1, "type": "stat", "title": "Active VUs", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 }, + "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "thresholds" } }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "max(k6_vus)", "legendFormat": "VUs" } ] + }, + { + "id": 2, "type": "timeseries", "title": "Throughput — iterations/s & assets submitted/s", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 12, "x": 6, "y": 1 }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(k6_iterations_total[1m]))", "legendFormat": "iterations/s" }, + { "refId": "B", "expr": "sum(rate(k6_mpiper_assets_submitted_total[1m]))", "legendFormat": "assets submitted/s" }, + { "refId": "C", "expr": "sum(rate(k6_http_reqs_total[1m]))", "legendFormat": "http reqs/s" } + ] + }, + { + "id": 3, "type": "stat", "title": "Client error rate (SLO < 1%)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 6, "x": 18, "y": 1 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.01 } ] } }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "max(k6_mpiper_flow_errors_rate)", "legendFormat": "flow errors" }, + { "refId": "B", "expr": "max(k6_http_req_failed_rate)", "legendFormat": "http failed" } + ] + }, + { + "id": 4, "type": "timeseries", "title": "Client step latency p95 (ms)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "max(k6_mpiper_presign_latency_ms_p95)", "legendFormat": "presign p95" }, + { "refId": "B", "expr": "max(k6_mpiper_upload_latency_ms_p95)", "legendFormat": "upload p95" }, + { "refId": "C", "expr": "max(k6_mpiper_complete_latency_ms_p95)", "legendFormat": "complete p95" } + ] + }, + { + "id": 5, "type": "timeseries", "title": "Client HTTP duration p95 / p99 (ms)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "max(k6_http_req_duration_p95)", "legendFormat": "http_req_duration p95" }, + { "refId": "B", "expr": "max(k6_http_req_duration_p99)", "legendFormat": "http_req_duration p99" } + ] + }, + + { "id": 101, "type": "row", "title": "API — server RED", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 } }, + { + "id": 6, "type": "timeseries", "title": "Request rate by route (req/s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 15 }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sum by (http_route) (rate(mpiper_http_server_request_count_total[1m]))", "legendFormat": "{{http_route}}" } ] + }, + { + "id": 7, "type": "timeseries", "title": "Duration p95 by route (exemplars → Tempo)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 15 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sli:http_request_latency_seconds:p95", "legendFormat": "{{http_route}}", "exemplar": true } ] + }, + { + "id": 8, "type": "timeseries", "title": "In-flight requests & DB query rate", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 15 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(mpiper_http_server_active_requests)", "legendFormat": "in-flight" }, + { "refId": "B", "expr": "sum(rate(mpiper_db_query_total[1m]))", "legendFormat": "db queries/s" } + ] + }, + + { "id": 102, "type": "row", "title": "Worker — saturation & service rate (μ)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } }, + { + "id": 9, "type": "timeseries", "title": "Service rate μ: consumed vs completed (jobs/s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[2m]))", "legendFormat": "consumed/s" }, + { "refId": "B", "expr": "sum(rate(mpiper_mpiper_job_processing_success_total[2m]))", "legendFormat": "completed/s (μ)" } + ] + }, + { + "id": 10, "type": "timeseries", "title": "Asset processing p95 by type (s) — exemplars", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "histogram_quantile(0.95, sum by (le, asset_type) (rate(mpiper_mpiper_asset_processing_duration_seconds_bucket[5m])))", "legendFormat": "{{asset_type}} p95", "exemplar": true } ] + }, + { + "id": 11, "type": "timeseries", "title": "Asset processing mean (s) by type", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sum by (asset_type)(rate(mpiper_mpiper_asset_processing_duration_seconds_sum[5m])) / sum by (asset_type)(rate(mpiper_mpiper_asset_processing_duration_seconds_count[5m]))", "legendFormat": "{{asset_type}} mean" } ] + }, + { + "id": 12, "type": "timeseries", "title": "Job processing p95 (s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_mpiper_job_processing_duration_seconds_bucket[5m])))", "legendFormat": "job p95" } ] + }, + + { "id": 103, "type": "row", "title": "Pipeline funnel & backlogs", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 } }, + { + "id": 13, "type": "timeseries", "title": "Funnel: uploaded → ready / failed (per s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_asset_upload_total[2m]))", "legendFormat": "uploaded/s" }, + { "refId": "B", "expr": "sum(rate(mpiper_mpiper_asset_processing_success_total[2m]))", "legendFormat": "ready/s" }, + { "refId": "C", "expr": "sum(rate(mpiper_mpiper_asset_processing_failed_total[2m]))", "legendFormat": "failed/s" } + ] + }, + { + "id": 14, "type": "timeseries", "title": "Queue depth (Redis stream length)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }, + "fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 20 } }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "max(mpiper_queue_depth)", "legendFormat": "queue depth" } ] + }, + { + "id": 15, "type": "timeseries", "title": "Backlogs: outbox & webhook pending", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "max(mpiper_outbox_pending)", "legendFormat": "outbox pending" }, + { "refId": "B", "expr": "max(mpiper_webhook_pending)", "legendFormat": "webhook pending" } + ] + }, + + { "id": 104, "type": "row", "title": "Queue / transport", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } }, + { + "id": 16, "type": "timeseries", "title": "Publish vs consume (msg/s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 46 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_queue_message_published_total[2m]))", "legendFormat": "published/s" }, + { "refId": "B", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[2m]))", "legendFormat": "consumed/s" } + ] + }, + { + "id": 17, "type": "timeseries", "title": "Queue wait p95 & outbox relay lag p95 (s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 46 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sli:queue_wait_seconds:p95", "legendFormat": "queue wait p95 (publish-side)" }, + { "refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_outbox_relay_lag_seconds_bucket[5m])))", "legendFormat": "outbox relay lag p95" } + ] + }, + + { "id": 105, "type": "row", "title": "SLO summary (vs targets)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 } }, + { + "id": 18, "type": "stat", "title": "Presign p95 (< 150ms)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 54 }, + "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.15 } ] } }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sli:presign_latency_seconds:p95" } ] + }, + { + "id": 19, "type": "stat", "title": "Image ready p95 (< 5s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 54 }, + "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 5 } ] } }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sli:image_ready_latency_seconds:p95" } ] + }, + { + "id": 20, "type": "stat", "title": "Queue wait p95 (< 2s)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 6, "x": 12, "y": 54 }, + "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 2 } ] } }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sli:queue_wait_seconds:p95" } ] + }, + { + "id": 21, "type": "stat", "title": "Job success rate (> 99%)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 6, "w": 6, "x": 18, "y": 54 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sli:job_success_ratio:ratio_rate5m" } ] + }, + + { "id": 106, "type": "row", "title": "Database (connection pool & queries)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } }, + { + "id": 22, "type": "timeseries", "title": "Connection pool (in-use / idle / open vs max)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 61 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "mpiper_db_connections_active", "legendFormat": "in-use" }, + { "refId": "B", "expr": "mpiper_db_connections_idle", "legendFormat": "idle" }, + { "refId": "C", "expr": "mpiper_db_connections_open", "legendFormat": "open" }, + { "refId": "D", "expr": "mpiper_db_connections_max_open", "legendFormat": "max open" } + ] + }, + { + "id": 23, "type": "timeseries", "title": "DB query rate (q/s), p95 latency & pool waits", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 61 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [ + { "matcher": { "id": "byName", "options": "query p95" }, "properties": [ { "id": "unit", "value": "s" } ] } + ] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_db_query_total[2m]))", "legendFormat": "queries/s" }, + { "refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_db_query_duration_seconds_bucket[5m])))", "legendFormat": "query p95" }, + { "refId": "C", "expr": "rate(mpiper_db_connections_wait_count[5m])", "legendFormat": "pool waits/s" } + ] + } + ] +} diff --git a/observability/grafana/dashboards/mpiper-metrics.json b/observability/grafana/dashboards/mpiper-metrics.json index e24f49f..edb9e6c 100644 --- a/observability/grafana/dashboards/mpiper-metrics.json +++ b/observability/grafana/dashboards/mpiper-metrics.json @@ -38,7 +38,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -87,7 +87,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "rate(mpiper_http_server_request_count_total[5m])", "refId": "A" @@ -99,7 +99,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -152,9 +152,9 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_bucket[5m]))", + "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))", "refId": "A" } ], @@ -164,7 +164,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -238,7 +238,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "mpiper_http_server_active_requests", "refId": "A" @@ -250,7 +250,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -299,9 +299,9 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"5..\"}[5m]) / rate(mpiper_http_server_request_count_total[5m])", + "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"5..\"}[5m]) / rate(mpiper_http_server_request_count_total[5m])", "refId": "A" } ], @@ -324,7 +324,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -398,27 +398,27 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, rate(mpiper_http_server_request_duration_bucket[5m]))", + "expr": "histogram_quantile(0.50, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))", "legendFormat": "p50", "refId": "A" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, rate(mpiper_http_server_request_duration_bucket[5m]))", + "expr": "histogram_quantile(0.95, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))", "legendFormat": "p95", "refId": "B" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_bucket[5m]))", + "expr": "histogram_quantile(0.99, rate(mpiper_http_server_request_duration_seconds_bucket[5m]))", "legendFormat": "p99", "refId": "C" } @@ -429,7 +429,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -503,27 +503,27 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"2..\"}[5m])", + "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"2..\"}[5m])", "legendFormat": "2xx - {{method}} {{path}}", "refId": "A" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"4..\"}[5m])", + "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"4..\"}[5m])", "legendFormat": "4xx - {{method}} {{path}}", "refId": "B" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, - "expr": "rate(mpiper_http_server_request_count_total{status_code=~\"5..\"}[5m])", + "expr": "rate(mpiper_http_server_request_count_total{http_status_code=~\"5..\"}[5m])", "legendFormat": "5xx - {{method}} {{path}}", "refId": "C" } @@ -547,7 +547,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -621,7 +621,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "rate(mpiper_asset_upload_total[5m])", "legendFormat": "Uploads", @@ -630,7 +630,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "rate(mpiper_asset_processing_success_total[5m])", "legendFormat": "Success", @@ -639,7 +639,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "rate(mpiper_asset_processing_failed_total[5m])", "legendFormat": "Failed", @@ -652,7 +652,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -726,7 +726,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.95, rate(mpiper_asset_upload_duration_bucket[5m]))", "legendFormat": "Upload p95", @@ -735,7 +735,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.95, rate(mpiper_asset_processing_duration_bucket[5m]))", "legendFormat": "Processing p95", @@ -748,7 +748,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -791,7 +791,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.50, rate(mpiper_asset_size_bucket[5m]))", "legendFormat": "p50", @@ -800,7 +800,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.95, rate(mpiper_asset_size_bucket[5m]))", "legendFormat": "p95", @@ -809,7 +809,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.99, rate(mpiper_asset_size_bucket[5m]))", "legendFormat": "p99", @@ -835,7 +835,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -909,7 +909,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.50, rate(mpiper_db_query_duration_bucket[5m]))", "legendFormat": "p50", @@ -918,7 +918,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.95, rate(mpiper_db_query_duration_bucket[5m]))", "legendFormat": "p95", @@ -927,7 +927,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "histogram_quantile(0.99, rate(mpiper_db_query_duration_bucket[5m]))", "legendFormat": "p99", @@ -940,7 +940,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -1014,7 +1014,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "mpiper_db_connections_active", "legendFormat": "Active", @@ -1023,7 +1023,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "mpiper_db_connections_idle", "legendFormat": "Idle", @@ -1036,7 +1036,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -1129,7 +1129,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "rate(mpiper_db_query_success_total[5m])", "legendFormat": "Success", @@ -1138,7 +1138,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "expr": "rate(mpiper_db_query_failed_total[5m])", "legendFormat": "Errors", diff --git a/observability/grafana/dashboards/mpiper-pipeline-funnel.json b/observability/grafana/dashboards/mpiper-pipeline-funnel.json new file mode 100644 index 0000000..c2b254a --- /dev/null +++ b/observability/grafana/dashboards/mpiper-pipeline-funnel.json @@ -0,0 +1,90 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 0, + "id": null, + "uid": "mpiper-pipeline-funnel", + "title": "MPiper — Pipeline Funnel & Stage Latency", + "tags": ["mpiper", "track-3", "funnel"], + "timezone": "browser", + "schemaVersion": 39, + "time": { "from": "now-30m", "to": "now" }, + "refresh": "10s", + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Funnel rate: uploaded → processed (success/failed)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_asset_upload_total[5m]))", "legendFormat": "uploaded/s" }, + { "refId": "B", "expr": "sum(rate(mpiper_mpiper_asset_processing_success_total[5m]))", "legendFormat": "ready/s" }, + { "refId": "C", "expr": "sum(rate(mpiper_mpiper_asset_processing_failed_total[5m]))", "legendFormat": "failed/s" } + ] + }, + { + "id": 2, + "type": "stat", + "title": "Image ready p95 (SLO < 5s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 5 } ] } + }, + "overrides": [] + }, + "targets": [ { "refId": "A", "expr": "sli:image_ready_latency_seconds:p95", "legendFormat": "image p95" } ] + }, + { + "id": 3, + "type": "stat", + "title": "Video ready p95 (SLO < 60s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 60 } ] } + }, + "overrides": [] + }, + "targets": [ { "refId": "A", "expr": "sli:video_ready_latency_seconds:p95", "legendFormat": "video p95" } ] + }, + { + "id": 4, + "type": "stat", + "title": "Job success rate (SLO > 99%)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", "min": 0, "max": 1, + "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } + }, + "overrides": [] + }, + "targets": [ { "refId": "A", "expr": "sli:job_success_ratio:ratio_rate5m", "legendFormat": "success rate" } ] + }, + { + "id": 5, + "type": "timeseries", + "title": "Worker asset processing p95 by type (exemplars → Tempo)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.95, sum by (le, asset_type) (rate(mpiper_mpiper_asset_processing_duration_seconds_bucket[5m])))", + "legendFormat": "{{asset_type}} p95", + "exemplar": true + } + ] + } + ] +} diff --git a/observability/grafana/dashboards/mpiper-queue-health.json b/observability/grafana/dashboards/mpiper-queue-health.json new file mode 100644 index 0000000..2523550 --- /dev/null +++ b/observability/grafana/dashboards/mpiper-queue-health.json @@ -0,0 +1,58 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 0, + "id": null, + "uid": "mpiper-queue-health", + "title": "MPiper — Queue Health", + "tags": ["mpiper", "track-3", "queue"], + "timezone": "browser", + "schemaVersion": 39, + "time": { "from": "now-30m", "to": "now" }, + "refresh": "10s", + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Stream length (queue depth)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "max(mpiper_queue_depth)", "legendFormat": "queue depth" } ] + }, + { + "id": 2, + "type": "timeseries", + "title": "Publish vs consume rate (msg/s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_queue_message_published_total[5m]))", "legendFormat": "published/s" }, + { "refId": "B", "expr": "sum(rate(mpiper_mpiper_queue_message_consumed_total[5m]))", "legendFormat": "consumed/s" } + ] + }, + { + "id": 3, + "type": "timeseries", + "title": "Queue wait p95 (publish-side proxy; authoritative in Tempo)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ { "refId": "A", "expr": "sli:queue_wait_seconds:p95", "legendFormat": "queue wait p95" } ] + }, + { + "id": 4, + "type": "timeseries", + "title": "Webhook delivery p95 (SLO < 10s) + pending", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sli:webhook_delivery_latency_seconds:p95", "legendFormat": "delivery p95" }, + { "refId": "B", "expr": "sli:webhook_pending:current", "legendFormat": "pending (count)" } + ] + } + ] +} diff --git a/observability/grafana/dashboards/mpiper-worker-use.json b/observability/grafana/dashboards/mpiper-worker-use.json new file mode 100644 index 0000000..0ab7fd2 --- /dev/null +++ b/observability/grafana/dashboards/mpiper-worker-use.json @@ -0,0 +1,89 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 0, + "id": null, + "uid": "mpiper-worker-use", + "title": "MPiper — Worker / App Saturation (USE)", + "tags": ["mpiper", "track-3", "use"], + "timezone": "browser", + "schemaVersion": 39, + "time": { "from": "now-30m", "to": "now" }, + "refresh": "10s", + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "type": "row", + "title": "Saturation (app-level: no per-container CPU by design)", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + { + "id": 2, + "type": "timeseries", + "title": "Queue depth (Redis stream length)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sli:queue_depth:current", "legendFormat": "queue depth" } + ] + }, + { + "id": 3, + "type": "timeseries", + "title": "In-flight HTTP requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(mpiper_http_server_active_requests)", "legendFormat": "active requests" } + ] + }, + { + "id": 4, + "type": "timeseries", + "title": "Outbox relay lag p95 (oldest pending age)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.95, sum by (le) (rate(mpiper_outbox_relay_lag_seconds_bucket[5m])))", + "legendFormat": "relay lag p95" + } + ] + }, + { + "id": 5, + "type": "timeseries", + "title": "Backlogs: outbox + webhook pending", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sli:outbox_pending:current", "legendFormat": "outbox pending" }, + { "refId": "B", "expr": "sli:webhook_pending:current", "legendFormat": "webhook pending" } + ] + }, + { + "id": 6, + "type": "row", + "title": "Errors", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 } + }, + { + "id": 7, + "type": "timeseries", + "title": "Queue message failures + outbox publish failures (rate)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "targets": [ + { "refId": "A", "expr": "sum(rate(mpiper_queue_message_failed_total[5m]))", "legendFormat": "queue failures/s" }, + { "refId": "B", "expr": "sum(rate(mpiper_outbox_publish_failures_total[5m]))", "legendFormat": "outbox publish failures/s" } + ] + } + ] +} diff --git a/observability/grafana/datasources/dashboards.yml b/observability/grafana/datasources/dashboards.yml deleted file mode 100644 index 3e5981b..0000000 --- a/observability/grafana/datasources/dashboards.yml +++ /dev/null @@ -1,17 +0,0 @@ -# ============================================================================ -# Grafana Dashboards - Auto-provisioning -# ============================================================================ - -apiVersion: 1 - -providers: - - name: 'Default' - orgId: 1 - folder: 'MPiper' - type: file - disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /etc/grafana/provisioning/dashboards - foldersFromFilesStructure: true diff --git a/observability/grafana/datasources/datasources.yml b/observability/grafana/datasources/datasources.yml index 1d35862..a8514be 100644 --- a/observability/grafana/datasources/datasources.yml +++ b/observability/grafana/datasources/datasources.yml @@ -57,8 +57,11 @@ datasources: url: http://loki:3100 jsonData: derivedFields: + # Match trace_id across formats: worker (trace_id=abc), API JSON + # ("trace_id":"abc"), and API dev pretty (trace_id: abc). \W+ swallows + # the separator (=, ":", ": ") between the key and the hex value. - datasourceUid: tempo - matcherRegex: "trace_id=(\\w+)" + matcherRegex: "trace_id\\W+(\\w+)" name: TraceID url: '$${__value.raw}' editable: true diff --git a/observability/prometheus.rules.yml b/observability/prometheus.rules.yml new file mode 100644 index 0000000..005f54a --- /dev/null +++ b/observability/prometheus.rules.yml @@ -0,0 +1,126 @@ +# ============================================================================ +# MPiper — SLO recording rules (Track 3, Phase 3) +# +# Encodes the SLIs from track-03-observability-and-load.md §4.2 as recording +# rules so each renders directly on a dashboard against its target. +# +# Metric naming convention +# ------------------------- +# App telemetry is OTLP → the collector's Prometheus exporter, which is +# configured with `namespace: mpiper` (see observability/otel-collector.yml). +# OTel instrument names have dots replaced by underscores and the unit appended: +# +# Go API http.server.request.duration (s) -> mpiper_http_server_request_duration_seconds_{bucket,sum,count} +# Go API asset.processing.success (counter) -> mpiper_asset_processing_success_total +# Worker mpiper.job.processing.success -> mpiper_mpiper_job_processing_success_total +# (worker instruments already carry a `mpiper.` prefix, so the +# collector namespace yields a double `mpiper_mpiper_` prefix) +# +# If your collector build sets `add_metric_suffixes: false` or a different +# namespace, adjust the series names below. Validate with: +# promtool check rules observability/prometheus.rules.yml +# and against live series in Prometheus → Status → Targets after a load run. +# ============================================================================ + +groups: + - name: mpiper_slo + interval: 15s + rules: + # --- Presign latency: p95 of POST /api/v1/storage/presign (target < 150ms) + - record: sli:presign_latency_seconds:p95 + expr: > + histogram_quantile( + 0.95, + sum by (le) ( + rate(mpiper_http_server_request_duration_seconds_bucket{http_route="/api/v1/storage/presign"}[5m]) + ) + ) + + # --- API overall request latency p95 per route (RED "Duration") + - record: sli:http_request_latency_seconds:p95 + expr: > + histogram_quantile( + 0.95, + sum by (le, http_route) ( + rate(mpiper_http_server_request_duration_seconds_bucket[5m]) + ) + ) + + # --- API error ratio per route (RED "Errors"): 5xx / all + - record: sli:http_error_ratio:ratio_rate5m + expr: > + sum by (http_route) ( + rate(mpiper_http_server_request_count_total{http_status_code=~"5.."}[5m]) + ) + / + clamp_min( + sum by (http_route) (rate(mpiper_http_server_request_count_total[5m])), + 1 + ) + + # --- Image ready latency: p95 of worker asset processing for images. + # Server-side processing proxy for complete -> ready; queue wait is added + # on top and is best read from the trace waterfall (enqueue -> consume). + - record: sli:image_ready_latency_seconds:p95 + expr: > + histogram_quantile( + 0.95, + sum by (le) ( + rate(mpiper_mpiper_asset_processing_duration_seconds_bucket{asset_type="image"}[5m]) + ) + ) + + # --- Video ready latency: p95 of worker asset processing for videos (target < 60s) + - record: sli:video_ready_latency_seconds:p95 + expr: > + histogram_quantile( + 0.95, + sum by (le) ( + rate(mpiper_mpiper_asset_processing_duration_seconds_bucket{asset_type="video"}[5m]) + ) + ) + + # --- Queue wait: p95 of the producer-side processing lag histogram. + # The authoritative enqueue -> consume gap is visible per-asset in Tempo; + # this metric approximates the publish-side contribution. + - record: sli:queue_wait_seconds:p95 + expr: > + histogram_quantile( + 0.95, + sum by (le) ( + rate(mpiper_queue_processing_lag_seconds_bucket[5m]) + ) + ) + + # --- Job success rate: done / (done + failed) (target > 99%) + # `or vector(0)` guards the empty-vector pitfall: with zero failures the + # failed series does not exist, and a bare A + B would yield no result. + - record: sli:job_success_ratio:ratio_rate5m + expr: > + (sum(rate(mpiper_mpiper_job_processing_success_total[5m])) or vector(0)) + / + clamp_min( + (sum(rate(mpiper_mpiper_job_processing_success_total[5m])) or vector(0)) + + (sum(rate(mpiper_mpiper_job_processing_failed_total[5m])) or vector(0)), + 1 + ) + + # --- Webhook delivery latency: p95 of delivery HTTP calls (target < 10s) + - record: sli:webhook_delivery_latency_seconds:p95 + expr: > + histogram_quantile( + 0.95, + sum by (le) ( + rate(mpiper_webhook_delivery_duration_seconds_bucket[5m]) + ) + ) + + # --- Queue health snapshots (gauges already exported by the app) + - record: sli:queue_depth:current + expr: max(mpiper_queue_depth) + + - record: sli:outbox_pending:current + expr: max(mpiper_outbox_pending) + + - record: sli:webhook_pending:current + expr: max(mpiper_webhook_pending) diff --git a/observability/prometheus.yml b/observability/prometheus.yml index a628c78..bf75a95 100644 --- a/observability/prometheus.yml +++ b/observability/prometheus.yml @@ -9,6 +9,11 @@ global: cluster: 'mpiper-local' environment: 'development' +# SLO recording rules (Track 3, Phase 3). Mounted into the container at +# /etc/prometheus/ alongside this file by docker-compose.observability.yml. +rule_files: + - /etc/prometheus/prometheus.rules.yml + # Alertmanager configuration (optional) # alerting: # alertmanagers: diff --git a/worker/consumer/consumer.py b/worker/consumer/consumer.py index c66c990..375daa9 100644 --- a/worker/consumer/consumer.py +++ b/worker/consumer/consumer.py @@ -28,16 +28,21 @@ from __future__ import annotations import time +from contextlib import nullcontext from typing import Dict import redis from redis.exceptions import ResponseError +from opentelemetry import trace +from opentelemetry.propagate import extract from worker.consumer.config import WorkerConfig from worker.consumer.db import PgPool from worker.processing.processor import RetryableException, process_asset_dispatch from worker.storage.base import StorageX from worker.utils.logger import get_logger +from worker.utils.tracing import get_tracer +from worker.utils import metrics as wm from worker.webhooks import insert_webhook_deliveries logger = get_logger(__name__) @@ -159,6 +164,7 @@ def consume(self, consumer_name: str) -> bool: # Response format: [(stream_name, [(msg_id, {field: value}), ...])] _, messages = resp[0] msg_id, fields = messages[0] + wm.record_consume() try: # Normalize fields to a dict @@ -180,14 +186,25 @@ def consume(self, consumer_name: str) -> bool: job_id = payload.get("job_id") asset_id = payload.get("asset_id") - if job_id: - self._handle_job(job_id, msg_id) - elif asset_id: - self._handle_asset_message(asset_id, msg_id) - else: - logger.error("message missing job_id and asset_id: %s", payload) - # Acknowledge to remove the malformed message from the stream. - self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id) + # Extract the producer trace context (injected by the Go relay) and + # continue the trace here. traceparent may be a top-level stream + # field or have been merged in from `body` above. We start the + # consume span as a CHILD of the producer context (keeps the Tempo + # waterfall readable) AND attach a link to it — a link is the correct + # primitive for queue fan-in, where one consumer may join many + # producers. The gap between the producer's enqueue span and this + # span is the queue wait time. + with self._consume_span(payload, msg_id, job_id, asset_id): + if job_id: + self._handle_job(job_id, msg_id) + elif asset_id: + self._handle_asset_message(asset_id, msg_id) + else: + logger.error("message missing job_id and asset_id: %s", payload) + # Acknowledge to remove the malformed message from the stream. + self.redis.xack( + self.cfg.stream_name, self.cfg.consumer_group, msg_id + ) except Exception: logger.exception("unhandled exception while processing message %s", msg_id) # Do not ack the message so it remains in the pending entries list @@ -195,6 +212,43 @@ def consume(self, consumer_name: str) -> bool: return True + def _consume_span(self, payload, msg_id, job_id, asset_id): + """Start the worker.consume span continuing the producer trace. + + Returns a context manager. When tracing is not initialised (telemetry + failed at startup) this is a no-op so message processing is unaffected. + """ + tracer = get_tracer() + if tracer is None: + return nullcontext() + + carrier = { + k: payload[k] + for k in ("traceparent", "tracestate", "baggage") + if k in payload + } + parent_ctx = extract(carrier) + producer_sc = trace.get_current_span(parent_ctx).get_span_context() + links = [trace.Link(producer_sc)] if producer_sc.is_valid else None + + attrs = { + "messaging.system": "redis", + "messaging.destination.name": self.cfg.stream_name, + "messaging.message.id": msg_id, + } + if job_id: + attrs["job_id"] = str(job_id) + if asset_id: + attrs["asset_id"] = str(asset_id) + + return tracer.start_as_current_span( + "worker.consume", + context=parent_ctx, + kind=trace.SpanKind.CONSUMER, + links=links, + attributes=attrs, + ) + def _handle_job(self, job_id: int, msg_id: str) -> None: """Load the job row, mark it in-progress, run processing, and finalize. @@ -233,10 +287,12 @@ def _handle_job(self, job_id: int, msg_id: str) -> None: conn.commit() # Run the processing outside the DB transaction. + job_start = time.time() try: process_asset_dispatch(asset_id, self.pg, self.storage, self.cfg) except Exception as exc: logger.exception("processing failed for job=%s asset=%s", job_id, asset_id) + wm.record_job(success=False, duration_seconds=time.time() - job_start) with self.pg.get_pg_conn() as conn: cur = conn.cursor() @@ -285,6 +341,8 @@ def _handle_job(self, job_id: int, msg_id: str) -> None: insert_webhook_deliveries(cur, asset_id, job_id, "job.done") conn.commit() + wm.record_job(success=True, duration_seconds=time.time() - job_start) + # Acknowledge the Redis stream message. self.redis.xack(self.cfg.stream_name, self.cfg.consumer_group, msg_id) diff --git a/worker/consumer/main.py b/worker/consumer/main.py index 673298c..c94bfff 100644 --- a/worker/consumer/main.py +++ b/worker/consumer/main.py @@ -10,6 +10,7 @@ from worker.consumer.migrations import run_migrations from worker.storage import get_storage from worker.utils import metrics as worker_metrics +from worker.utils import tracing as worker_tracing logger = logging.getLogger(__name__) @@ -19,6 +20,33 @@ def main(): logger.info("Starting worker consumer...") cfg = get_config() + + # Initialise telemetry before anything else so startup is observable. + # NOTE: init_metrics() was previously never called — worker OTel metrics + # were defined but never wired up. We initialise both tracing and metrics + # here from the same OtelConfig so they share endpoint/resource/lifecycle. + otel = cfg.otel + try: + worker_tracing.init_tracing( + service_name=otel.service_name, + service_version=otel.service_version, + endpoint=otel.endpoint, + deployment_env=otel.deployment_env, + instance_id=otel.instance_id, + tls_insecure=otel.tls_insecure, + ) + worker_metrics.init_metrics( + service_name=otel.service_name, + service_version=otel.service_version, + endpoint=otel.endpoint, + deployment_env=otel.deployment_env, + instance_id=otel.instance_id, + tls_insecure=otel.tls_insecure, + ) + except Exception: + # Telemetry must never prevent the worker from processing jobs. + logger.exception("failed to initialise telemetry; continuing without it") + storage = get_storage(cfg) password = quote_plus(cfg.database.password) @@ -60,8 +88,9 @@ def _term(signum, frame): time.sleep(1) logger.info("exiting") - - # Shutdown metrics on exit + + # Shutdown telemetry on exit (flush pending spans + metrics). + worker_tracing.shutdown_tracing() worker_metrics.shutdown_metrics() diff --git a/worker/processing/images.py b/worker/processing/images.py index e4c03f5..83171ed 100644 --- a/worker/processing/images.py +++ b/worker/processing/images.py @@ -3,7 +3,10 @@ import logging import os +from opentelemetry import trace + logger = logging.getLogger("images") +tracer = trace.get_tracer("worker.processing.images") IMAGE_VARIANTS = [ { @@ -68,40 +71,48 @@ def process_image_file( role = v["role"] logger.info("generating image variant %s for asset %s", role, asset_id) - if v["crop"]: - out_img = ImageOps.fit( - img, - (v["width"], v["height"]), - Image.LANCZOS, - centering=(0.5, 0.5), - ) - else: - target_w = v["width"] or src_width - ratio = target_w / float(src_width) - target_h = int(src_height * ratio) - out_img = img.resize((target_w, target_h), Image.LANCZOS) - - data = encode_image(out_img, v["format"], v["quality"]) - - key = f"media/processed/{asset_id}/{role}.{v['format']}" - storage.upload_bytes(key, data, content_type=f"image/{v['format']}") - url = storage.public_url(key) - - # Upsert into variants.image (PK is asset_id + role) - with pg_pool.get_pg_conn() as conn: - conn.cursor().execute( - """ - INSERT INTO variants.image (asset_id, url, role, width, height, size_bytes, format) - VALUES (%s, %s, %s, %s, %s, %s, %s) - ON CONFLICT (asset_id, role) DO UPDATE SET - url = EXCLUDED.url, - width = EXCLUDED.width, - height = EXCLUDED.height, - size_bytes = EXCLUDED.size_bytes, - format = EXCLUDED.format - """, - (asset_id, url, role, out_img.width, out_img.height, len(data), v["format"]), - ) + with tracer.start_as_current_span("image.variant") as span: + span.set_attribute("asset_id", asset_id) + span.set_attribute("variant.role", role) + span.set_attribute("variant.format", v["format"]) + + if v["crop"]: + out_img = ImageOps.fit( + img, + (v["width"], v["height"]), + Image.LANCZOS, + centering=(0.5, 0.5), + ) + else: + target_w = v["width"] or src_width + ratio = target_w / float(src_width) + target_h = int(src_height * ratio) + out_img = img.resize((target_w, target_h), Image.LANCZOS) + + data = encode_image(out_img, v["format"], v["quality"]) + span.set_attribute("variant.width", out_img.width) + span.set_attribute("variant.height", out_img.height) + span.set_attribute("variant.size_bytes", len(data)) + + key = f"media/processed/{asset_id}/{role}.{v['format']}" + storage.upload_bytes(key, data, content_type=f"image/{v['format']}") + url = storage.public_url(key) + + # Upsert into variants.image (PK is asset_id + role) + with pg_pool.get_pg_conn() as conn: + conn.cursor().execute( + """ + INSERT INTO variants.image (asset_id, url, role, width, height, size_bytes, format) + VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (asset_id, role) DO UPDATE SET + url = EXCLUDED.url, + width = EXCLUDED.width, + height = EXCLUDED.height, + size_bytes = EXCLUDED.size_bytes, + format = EXCLUDED.format + """, + (asset_id, url, role, out_img.width, out_img.height, len(data), v["format"]), + ) # Mark asset ready with pg_pool.get_pg_conn() as conn: diff --git a/worker/processing/processor.py b/worker/processing/processor.py index 9e6244f..5166fce 100644 --- a/worker/processing/processor.py +++ b/worker/processing/processor.py @@ -1,6 +1,9 @@ import os +import time from enum import Enum +from opentelemetry import trace + from worker.consumer.config import WorkerConfig from worker.consumer.db import PgPool from worker.processing.images import process_image_file @@ -8,8 +11,10 @@ from worker.storage.base import StorageX from worker.utils.hash import compute_file_hash from worker.utils.logger import get_logger +from worker.utils import metrics as wm logger = get_logger(__name__) +tracer = trace.get_tracer("worker.processing") class AssetStatus(Enum): @@ -133,88 +138,119 @@ def process_asset_dispatch( """ Main entry point for asset processing with deduplication. """ - # 1. Load asset metadata - with pg_pool.get_pg_conn() as conn: - cur = conn.cursor() - cur.execute( - """ - SELECT asset_id, type, status, original_url, mime_type, content_hash - FROM assets - WHERE asset_id = %s - """, - (asset_id,), - ) - row = cur.fetchone() - if not row: - raise RuntimeError(f"Asset not found: {asset_id}") - - _, typ, status, original_url, mime_type, content_hash = row - - # 2. Early exit if already processed - if status in (AssetStatus.READY.value, AssetStatus.DUPLICATE.value): - logger.info("Asset %s already in final state: %s", asset_id, status) - return + with tracer.start_as_current_span("process.dispatch") as span: + # asset_id is fine as a span attribute (high cardinality is OK on traces), + # but must NEVER become a metric label. + span.set_attribute("asset_id", asset_id) - # 3. Proceed with processing - local_raw_file = None - try: - # Mark as processing + # 1. Load asset metadata with pg_pool.get_pg_conn() as conn: cur = conn.cursor() cur.execute( - "UPDATE assets SET status = %s WHERE asset_id = %s", - (AssetStatus.PROCESSING.value, asset_id) + """ + SELECT asset_id, type, status, original_url, mime_type, content_hash + FROM assets + WHERE asset_id = %s + """, + (asset_id,), ) - - # Download raw file - raw_key = f"media/raw/{asset_id}" - tmp_dir = cfg.temp_dir - os.makedirs(tmp_dir, exist_ok=True) - local_raw_file = os.path.join( - tmp_dir, f"{asset_id}-raw.{get_extension_for_mime(mime_type)}" - ) - storage.download_to_file(raw_key, local_raw_file) - - content_hash = compute_file_hash(local_raw_file) - - # Check for duplicate using the actual downloaded file's hash - if content_hash: - dedup_result = check_for_duplicate(content_hash, asset_id, pg_pool) - - if dedup_result == DedupResult.DUPLICATE_READY: - logger.info("Asset %s deduplicated successfully", asset_id) - return - elif dedup_result == DedupResult.DUPLICATE_PENDING: - raise RetryableException( - f"Canonical asset for {asset_id} not ready yet" + row = cur.fetchone() + if not row: + raise RuntimeError(f"Asset not found: {asset_id}") + + _, typ, status, original_url, mime_type, content_hash = row + + span.set_attribute("asset.type", typ or "unknown") + span.set_attribute("asset.status", status or "unknown") + + # 2. Early exit if already processed + if status in (AssetStatus.READY.value, AssetStatus.DUPLICATE.value): + logger.info("Asset %s already in final state: %s", asset_id, status) + span.set_attribute("dispatch.short_circuit", "already_final") + return + + # 3. Proceed with processing + local_raw_file = None + proc_start = time.time() + try: + # Mark as processing + with pg_pool.get_pg_conn() as conn: + cur = conn.cursor() + cur.execute( + "UPDATE assets SET status = %s WHERE asset_id = %s", + (AssetStatus.PROCESSING.value, asset_id) ) - # Process based on type - if typ == "image": - process_image_file( - asset_id, local_raw_file, content_hash, pg_pool, storage, cfg - ) - elif typ == "video": - process_video_file( - asset_id, local_raw_file, content_hash, pg_pool, storage, cfg + # Download raw file + raw_key = f"media/raw/{asset_id}" + tmp_dir = cfg.temp_dir + os.makedirs(tmp_dir, exist_ok=True) + local_raw_file = os.path.join( + tmp_dir, f"{asset_id}-raw.{get_extension_for_mime(mime_type)}" ) - else: - raise ValueError(f"Unknown asset type: {typ}") - - except Exception as e: - # Do not touch assets.status here. The consumer (_handle_job) owns the - # asset state transition: it marks the asset failed only after the retry - # cap is hit, and ready on success. Writing 'failed' on every exception - # — including RetryableException — left the asset stuck failed across - # retries even though the job was still pending. See DEV-34. - logger.error("Failed to process asset %s: %s", asset_id, e, exc_info=True) - raise - finally: - if local_raw_file and os.path.exists(local_raw_file): - try: - os.unlink(local_raw_file) - except OSError: - logger.warning("Failed to delete temp file %s", local_raw_file) + with tracer.start_as_current_span("process.download") as dl_span: + dl_span.set_attribute("asset_id", asset_id) + dl_span.set_attribute("storage.key", raw_key) + storage.download_to_file(raw_key, local_raw_file) + try: + dl_span.set_attribute( + "download.size_bytes", os.path.getsize(local_raw_file) + ) + except OSError: + pass + + content_hash = compute_file_hash(local_raw_file) + + # Check for duplicate using the actual downloaded file's hash + if content_hash: + with tracer.start_as_current_span("process.dedup_check") as dd_span: + dd_span.set_attribute("asset_id", asset_id) + dd_span.set_attribute("content_hash", content_hash) + dedup_result = check_for_duplicate(content_hash, asset_id, pg_pool) + dd_span.set_attribute("dedup.result", dedup_result.value) + + if dedup_result == DedupResult.DUPLICATE_READY: + logger.info("Asset %s deduplicated successfully", asset_id) + span.set_attribute("dispatch.short_circuit", "deduplicated") + return + elif dedup_result == DedupResult.DUPLICATE_PENDING: + raise RetryableException( + f"Canonical asset for {asset_id} not ready yet" + ) + + # Process based on type + proc_start = time.time() + if typ == "image": + process_image_file( + asset_id, local_raw_file, content_hash, pg_pool, storage, cfg + ) + elif typ == "video": + process_video_file( + asset_id, local_raw_file, content_hash, pg_pool, storage, cfg + ) + else: + raise ValueError(f"Unknown asset type: {typ}") + + # Asset processing duration, labelled by type only (low cardinality). + # Feeds the image/video "ready latency" SLIs. asset_id stays a span + # attribute, never a metric label. + wm.record_asset(typ, time.time() - proc_start, success=True) + + except Exception as e: + # Do not touch assets.status here. The consumer (_handle_job) owns the + # asset state transition: it marks the asset failed only after the retry + # cap is hit, and ready on success. Writing 'failed' on every exception + # — including RetryableException — left the asset stuck failed across + # retries even though the job was still pending. See DEV-34. + logger.error("Failed to process asset %s: %s", asset_id, e, exc_info=True) + wm.record_asset(typ, time.time() - proc_start, success=False) + raise + finally: + if local_raw_file and os.path.exists(local_raw_file): + try: + os.unlink(local_raw_file) + except OSError: + logger.warning("Failed to delete temp file %s", local_raw_file) def clone_image_variants(cur, canonical_asset_id: str, new_asset_id: str) -> int: diff --git a/worker/processing/videos.py b/worker/processing/videos.py index 4432e6a..30eabe0 100644 --- a/worker/processing/videos.py +++ b/worker/processing/videos.py @@ -5,15 +5,25 @@ import subprocess import tempfile +from opentelemetry import trace + logger = logging.getLogger("videos") +tracer = trace.get_tracer("worker.processing.videos") def run(cmd: list[str]) -> None: """Execute ffmpeg command with error handling.""" logger.info("ffmpeg: %s", " ".join(cmd)) - res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if res.returncode != 0: - raise RuntimeError(f"FFmpeg failed: {res.stderr.decode()}") + with tracer.start_as_current_span("ffmpeg.exec") as span: + # cmd[0] is the binary (ffmpeg/ffprobe); record it without the full + # argv to avoid leaking paths as high-cardinality span attributes. + span.set_attribute("ffmpeg.binary", cmd[0] if cmd else "") + res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + span.set_attribute("ffmpeg.returncode", res.returncode) + if res.returncode != 0: + err = res.stderr.decode() + span.set_status(trace.StatusCode.ERROR, "ffmpeg failed") + raise RuntimeError(f"FFmpeg failed: {err}") def probe_video(local_path: str) -> dict: @@ -152,9 +162,14 @@ def process_video_file(asset_id, local_raw_path, content_hash, pg_pool, storage, (content_hash, asset_id), ) - generate_poster(asset_id, local_raw_path, storage, cfg, pg_pool) - transcode_720p(asset_id, local_raw_path, storage, cfg, pg_pool) - generate_preview(asset_id, local_raw_path, storage, cfg, pg_pool) + for stage_name, fn in ( + ("video.poster", generate_poster), + ("video.transcode_720p", transcode_720p), + ("video.preview", generate_preview), + ): + with tracer.start_as_current_span(stage_name) as span: + span.set_attribute("asset_id", asset_id) + fn(asset_id, local_raw_path, storage, cfg, pg_pool) # Mark asset ready with pg_pool.get_pg_conn() as conn: diff --git a/worker/tests/test_consumer_tracing.py b/worker/tests/test_consumer_tracing.py new file mode 100644 index 0000000..4e6eca7 --- /dev/null +++ b/worker/tests/test_consumer_tracing.py @@ -0,0 +1,109 @@ +import unittest +from unittest.mock import MagicMock, patch + +from opentelemetry.propagate import set_global_textmap +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) +from opentelemetry.trace.propagation.tracecontext import ( + TraceContextTextMapPropagator, +) + +from worker.consumer.consumer import Consumer + + +TRACE_ID_HEX = "0af7651916cd43dd8448eb211c80319c" +SPAN_ID_HEX = "b7ad6b7169203331" +TRACEPARENT = f"00-{TRACE_ID_HEX}-{SPAN_ID_HEX}-01" + + +def _make_consumer(): + cfg = MagicMock() + cfg.stream_name = "media:jobs" + cfg.consumer_group = "media-workers" + with patch("worker.consumer.consumer.redis.Redis.from_url") as from_url: + from_url.return_value = MagicMock() + consumer = Consumer( + pg_pool=MagicMock(), redis_url="redis://x", storage=MagicMock(), cfg=cfg + ) + return consumer, from_url.return_value + + +class TestConsumeSpanPropagation(unittest.TestCase): + """Phase 1d: the worker continues the producer trace across the queue.""" + + def setUp(self): + set_global_textmap(TraceContextTextMapPropagator()) + self.exporter = InMemorySpanExporter() + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(self.exporter)) + self.tracer = provider.get_tracer("test") + + def _run_consume_with(self, fields): + consumer, client = _make_consumer() + client.xreadgroup.return_value = [("media:jobs", [("1-0", fields)])] + consumer._recover_stuck_pending = MagicMock() + consumer._handle_job = MagicMock() + consumer._handle_asset_message = MagicMock() + with patch( + "worker.consumer.consumer.get_tracer", return_value=self.tracer + ): + consumer.consume("worker-1") + return consumer + + def test_consume_span_is_child_and_linked_to_producer(self): + consumer = self._run_consume_with({"job_id": "42", "traceparent": TRACEPARENT}) + consumer._handle_job.assert_called_once_with("42", "1-0") + + spans = self.exporter.get_finished_spans() + consume = next(s for s in spans if s.name == "worker.consume") + + expected_trace_id = int(TRACE_ID_HEX, 16) + expected_span_id = int(SPAN_ID_HEX, 16) + + # Child: parent is the producer context, and the span continues the trace. + self.assertIsNotNone(consume.parent) + self.assertEqual(consume.parent.trace_id, expected_trace_id) + self.assertEqual(consume.parent.span_id, expected_span_id) + self.assertEqual(consume.context.trace_id, expected_trace_id) + + # Link: queue fan-in primitive points at the same producer context. + self.assertTrue(consume.links) + self.assertEqual(consume.links[0].context.trace_id, expected_trace_id) + + def test_consume_without_traceparent_starts_new_trace(self): + consumer = self._run_consume_with({"job_id": "42"}) + consumer._handle_job.assert_called_once_with("42", "1-0") + spans = self.exporter.get_finished_spans() + consume = next(s for s in spans if s.name == "worker.consume") + # No producer context -> root span, no link. + self.assertIsNone(consume.parent) + self.assertFalse(consume.links) + + def test_traceparent_merged_from_body(self): + import json + + body = json.dumps({"job_id": "42"}) + consumer = self._run_consume_with({"body": body, "traceparent": TRACEPARENT}) + consumer._handle_job.assert_called_once_with("42", "1-0") + spans = self.exporter.get_finished_spans() + consume = next(s for s in spans if s.name == "worker.consume") + self.assertEqual(consume.parent.trace_id, int(TRACE_ID_HEX, 16)) + + +class TestTracerInit(unittest.TestCase): + def test_init_tracing_sets_tracer(self): + import worker.utils.tracing as tracing + + # Reset module state for a clean init. + tracing._tracer = None + tracing._provider = None + tracing.init_tracing(endpoint="otel-collector:4317", deployment_env="local") + self.assertIsNotNone(tracing.get_tracer()) + tracing.shutdown_tracing() + + +if __name__ == "__main__": + unittest.main() diff --git a/worker/tests/test_logging_correlation.py b/worker/tests/test_logging_correlation.py new file mode 100644 index 0000000..6962fcc --- /dev/null +++ b/worker/tests/test_logging_correlation.py @@ -0,0 +1,38 @@ +import logging +import unittest + +from opentelemetry.sdk.trace import TracerProvider + +from worker.utils.logger import TraceContextFilter + + +class TestTraceContextFilter(unittest.TestCase): + """Phase 2b: log records carry the active span's trace_id/span_id.""" + + def _record(self): + return logging.LogRecord( + name="t", level=logging.INFO, pathname=__file__, lineno=1, + msg="hello", args=(), exc_info=None, + ) + + def test_no_span_emits_empty(self): + f = TraceContextFilter() + rec = self._record() + f.filter(rec) + self.assertEqual(rec.trace_id, "") + self.assertEqual(rec.span_id, "") + + def test_active_span_stamps_ids(self): + provider = TracerProvider() + tracer = provider.get_tracer("test") + f = TraceContextFilter() + with tracer.start_as_current_span("s"): + rec = self._record() + f.filter(rec) + self.assertEqual(len(rec.trace_id), 32) + self.assertEqual(len(rec.span_id), 16) + self.assertRegex(rec.trace_id, r"^[0-9a-f]{32}$") + + +if __name__ == "__main__": + unittest.main() diff --git a/worker/tests/test_pipeline_spans.py b/worker/tests/test_pipeline_spans.py new file mode 100644 index 0000000..b223c4f --- /dev/null +++ b/worker/tests/test_pipeline_spans.py @@ -0,0 +1,77 @@ +import unittest +from unittest.mock import MagicMock, patch + +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) +from opentelemetry import trace + +import worker.processing.processor as processor + + +class TestPipelineStageSpans(unittest.TestCase): + """Phase 2a: dispatch emits download + dedup (+ delegate) spans.""" + + def setUp(self): + self.exporter = InMemorySpanExporter() + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(self.exporter)) + # Point the module-level proxy tracer at our in-memory provider. + self._tracer = provider.get_tracer("test") + self._orig = processor.tracer + processor.tracer = self._tracer + + def tearDown(self): + processor.tracer = self._orig + + def _pg_pool_returning(self, asset_row): + cursor = MagicMock() + cursor.fetchone.return_value = asset_row + conn = MagicMock() + conn.cursor.return_value = cursor + pg = MagicMock() + pg.get_pg_conn.return_value.__enter__.return_value = conn + return pg + + @patch("worker.processing.processor.get_extension_for_mime", return_value="jpg") + @patch("worker.processing.processor.compute_file_hash", return_value="") + @patch("worker.processing.processor.process_image_file") + @patch("worker.processing.processor.os.path.exists", return_value=False) + def test_image_dispatch_emits_stage_spans( + self, _exists, mock_img, _hash, _ext + ): + asset_row = ("a1", "image", "uploaded", "u", "image/jpeg", None) + pg = self._pg_pool_returning(asset_row) + storage = MagicMock() + cfg = MagicMock() + cfg.temp_dir = "/tmp" + + processor.process_asset_dispatch("a1", pg, storage, cfg) + + names = {s.name for s in self.exporter.get_finished_spans()} + self.assertIn("process.dispatch", names) + self.assertIn("process.download", names) + mock_img.assert_called_once() + + @patch("worker.processing.processor.get_extension_for_mime", return_value="jpg") + @patch("worker.processing.processor.compute_file_hash", return_value="abc123") + @patch("worker.processing.processor.check_for_duplicate") + @patch("worker.processing.processor.process_image_file") + @patch("worker.processing.processor.os.path.exists", return_value=False) + def test_dedup_span_emitted_when_hash_present( + self, _exists, mock_img, mock_dedup, _hash, _ext + ): + mock_dedup.return_value = processor.DedupResult.NO_DUPLICATE + asset_row = ("a1", "image", "uploaded", "u", "image/jpeg", None) + pg = self._pg_pool_returning(asset_row) + + processor.process_asset_dispatch("a1", pg, MagicMock(), MagicMock(temp_dir="/tmp")) + + names = {s.name for s in self.exporter.get_finished_spans()} + self.assertIn("process.dedup_check", names) + + +if __name__ == "__main__": + unittest.main() diff --git a/worker/utils/logger.py b/worker/utils/logger.py index bbc642c..6b2122b 100644 --- a/worker/utils/logger.py +++ b/worker/utils/logger.py @@ -2,8 +2,33 @@ import os from typing import Optional +from opentelemetry import trace -_DEFAULT_FORMAT = "%(asctime)s %(levelname)s [%(name)s] %(message)s" + +# trace_id= matches the Grafana Loki derived-field regex (trace_id=(\w+)), +# which links each log line to its Tempo trace. span_id is included for context. +_DEFAULT_FORMAT = ( + "%(asctime)s %(levelname)s [%(name)s] " + "trace_id=%(trace_id)s span_id=%(span_id)s %(message)s" +) + + +class TraceContextFilter(logging.Filter): + """Inject the active span's trace_id/span_id into every log record. + + Emits empty strings when there is no active recording span, so the Grafana + derived field does not create a link to a non-existent trace. + """ + + def filter(self, record: logging.LogRecord) -> bool: + ctx = trace.get_current_span().get_span_context() + if ctx is not None and ctx.is_valid: + record.trace_id = format(ctx.trace_id, "032x") + record.span_id = format(ctx.span_id, "016x") + else: + record.trace_id = "" + record.span_id = "" + return True def setup_logging( @@ -35,6 +60,12 @@ def setup_logging( format=fmt, ) + # Attach the trace-context filter at the handler level so it stamps every + # record flowing through, regardless of which logger emitted it. + trace_filter = TraceContextFilter() + for handler in logging.getLogger().handlers: + handler.addFilter(trace_filter) + # Silence noisy libraries (optional, but recommended) logging.getLogger("urllib3").setLevel(logging.WARNING) logging.getLogger("botocore").setLevel(logging.WARNING) diff --git a/worker/utils/metrics.py b/worker/utils/metrics.py index 82af1be..27d1004 100644 --- a/worker/utils/metrics.py +++ b/worker/utils/metrics.py @@ -18,6 +18,10 @@ from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.metrics.view import ( + ExplicitBucketHistogramAggregation, + View, +) from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION, DEPLOYMENT_ENVIRONMENT, SERVICE_INSTANCE_ID from worker.utils.logger import get_logger @@ -97,8 +101,27 @@ def init_metrics( # Create metric reader with 15-second export interval reader = PeriodicExportingMetricReader(exporter, export_interval_millis=15000) + # Finer histogram buckets for duration metrics. The SDK default buckets are + # too coarse for sub-second work (everything lands in [0,5) so p95 reads + # ~4.75s), which makes the image/job latency SLIs meaningless. These cover + # tens-of-ms (images) through tens-of-seconds (video transcode). + _duration_buckets = [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300] + duration_views = [ + View( + instrument_name=name, + aggregation=ExplicitBucketHistogramAggregation(_duration_buckets), + ) + for name in ( + "mpiper.asset.processing.duration", + "mpiper.job.processing.duration", + "mpiper.queue.processing.duration", + ) + ] + # Create meter provider - provider = MeterProvider(resource=resource, metric_readers=[reader]) + provider = MeterProvider( + resource=resource, metric_readers=[reader], views=duration_views + ) metrics.set_meter_provider(provider) # Get meter @@ -220,6 +243,40 @@ def init_metrics( logger.info("OpenTelemetry metrics initialized successfully") +def record_consume() -> None: + """Count one consumed queue message (no-op until init_metrics runs).""" + if queue_message_consumed is not None: + queue_message_consumed.add(1) + + +def record_job(success: bool, duration_seconds: float) -> None: + """Record job-level outcome + duration. Safe before init (no-op).""" + if job_processing_total is not None: + job_processing_total.add(1) + if success and job_processing_success is not None: + job_processing_success.add(1) + if (not success) and job_processing_failed is not None: + job_processing_failed.add(1) + if job_processing_duration is not None: + job_processing_duration.record(duration_seconds) + + +def record_asset(asset_type: str, duration_seconds: float, success: bool) -> None: + """Record asset processing outcome + duration, labelled by type only. + + asset_type is low-cardinality (image/video); asset_id must never be a label. + """ + attrs = {"asset_type": asset_type or "unknown"} + if asset_processing_total is not None: + asset_processing_total.add(1, attrs) + if success and asset_processing_success is not None: + asset_processing_success.add(1, attrs) + if (not success) and asset_processing_failed is not None: + asset_processing_failed.add(1, attrs) + if asset_processing_duration is not None: + asset_processing_duration.record(duration_seconds, attrs) + + def get_meter() -> Optional[metrics.Meter]: """Get the global meter instance. diff --git a/worker/utils/tracing.py b/worker/utils/tracing.py new file mode 100644 index 0000000..c2cadf5 --- /dev/null +++ b/worker/utils/tracing.py @@ -0,0 +1,121 @@ +""" +worker.utils.tracing + +OpenTelemetry tracing initialization for the Python worker. + +Mirrors `worker.utils.metrics`: an OTLP gRPC exporter to the same collector +endpoint, a BatchSpanProcessor, and the SAME W3C propagators as the Go API +(`traceparent` + `baggage`) so the trace continues across the Redis boundary +instead of starting fresh. + +The worker had OTel metric instruments but no tracer and no context extraction, +so the distributed trace died at the queue. This closes that gap on the consumer +side; `worker.consumer.consumer` extracts the producer context and starts the +consume span as a child (with a link) of it. +""" + +from typing import Optional + +from opentelemetry import trace +from opentelemetry.baggage.propagation import W3CBaggagePropagator +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.propagate import set_global_textmap +from opentelemetry.propagators.composite import CompositePropagator +from opentelemetry.sdk.resources import ( + DEPLOYMENT_ENVIRONMENT, + SERVICE_INSTANCE_ID, + SERVICE_NAME, + SERVICE_VERSION, + Resource, +) +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased, TraceIdRatioBased +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + +from worker.utils.logger import get_logger + +logger = get_logger(__name__) + +# Global tracer for the worker +_tracer: Optional[trace.Tracer] = None +_provider: Optional[TracerProvider] = None + + +def _build_sampler(deployment_env: str, sampling_rate: float): + """AlwaysSample in dev/local; parent-based ratio sampling otherwise. + + Matches the Go API's getSampler() so both services agree on what to keep. + """ + if deployment_env in ("development", "dev", "local", ""): + return ALWAYS_ON + return ParentBased(root=TraceIdRatioBased(sampling_rate)) + + +def init_tracing( + service_name: str = "mpiper-worker", + service_version: str = "dev", + endpoint: str = "otel-collector:4317", + deployment_env: str = "development", + instance_id: Optional[str] = None, + tls_insecure: bool = True, + sampling_rate: float = 1.0, +) -> None: + """Initialize OpenTelemetry tracing with an OTLP gRPC span exporter. + + Parameters should be sourced from the centralised config (get_config().otel). + Idempotent: a second call is a no-op so the worker can call it safely on + startup alongside init_metrics. + """ + global _tracer, _provider + + if _tracer is not None: + logger.warning("Tracing already initialized") + return + + if "://" in endpoint: + endpoint = endpoint.split("://", 1)[1] + + logger.info(f"Initializing OpenTelemetry tracer with endpoint: {endpoint}") + + resource = Resource.create( + { + SERVICE_NAME: service_name, + SERVICE_VERSION: service_version, + DEPLOYMENT_ENVIRONMENT: deployment_env, + SERVICE_INSTANCE_ID: instance_id or service_name, + } + ) + + exporter = OTLPSpanExporter(endpoint=endpoint, insecure=tls_insecure) + + provider = TracerProvider( + resource=resource, + sampler=_build_sampler(deployment_env, sampling_rate), + ) + provider.add_span_processor(BatchSpanProcessor(exporter)) + trace.set_tracer_provider(provider) + + # Same propagators as the Go API (composite TraceContext + Baggage) so the + # traceparent the producer injected is understood here. + set_global_textmap( + CompositePropagator( + [TraceContextTextMapPropagator(), W3CBaggagePropagator()] + ) + ) + + _provider = provider + _tracer = trace.get_tracer(__name__) + logger.info("OpenTelemetry tracer initialized successfully") + + +def get_tracer() -> Optional[trace.Tracer]: + """Return the global worker tracer, or None if init_tracing was not called.""" + return _tracer + + +def shutdown_tracing() -> None: + """Flush and shut down the tracer provider on exit.""" + if _provider is not None: + _provider.shutdown() + logger.info("Tracer provider shutdown complete")