diff --git a/.env.example b/.env.example index 49aaf69..fcd8091 100644 --- a/.env.example +++ b/.env.example @@ -12,6 +12,10 @@ SERVER_PORT=8080 # Logging LOG_LEVEL=info +# Metrics configuration +METRICS_ENABLED=true +METRICS_NAMESPACE=secrets + # Master keys (Envelope Encryption) # Generate a new master key using: ./bin/app create-master-key # Each key must be exactly 32 bytes (256 bits), base64-encoded diff --git a/README.md b/README.md index 88b1655..df38c91 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Secrets is inspired by **HashiCorp Vault** โค๏ธ, but it is intentionally **muc The default way to run Secrets is the published Docker image: ```bash -docker pull allisson/secrets:v0.2.0 +docker pull allisson/secrets:v0.3.0 ``` Use pinned tags for reproducible setups. `latest` is also available for fast iteration. @@ -27,6 +27,13 @@ Then follow the Docker setup guide in [docs/getting-started/docker.md](docs/gett 1. ๐Ÿณ **Run with Docker image (recommended)**: [docs/getting-started/docker.md](docs/getting-started/docker.md) 2. 
๐Ÿ’ป **Run locally for development**: [docs/getting-started/local-development.md](docs/getting-started/local-development.md) +## ๐Ÿ†• What's New in v0.3.0 + +- ๐Ÿ“Š OpenTelemetry metrics with Prometheus-compatible export at `GET /metrics` +- โš™๏ธ Runtime metrics controls via `METRICS_ENABLED` and `METRICS_NAMESPACE` +- ๐Ÿ“ˆ HTTP and business-operation metrics for auth, secrets, and transit flows +- ๐Ÿ“˜ New monitoring operations guide: [docs/operations/monitoring.md](docs/operations/monitoring.md) + ## ๐Ÿ“š Docs Map - **Start Here** @@ -36,7 +43,7 @@ Then follow the Docker setup guide in [docs/getting-started/docker.md](docs/gett - ๐Ÿงฐ **Troubleshooting**: [docs/getting-started/troubleshooting.md](docs/getting-started/troubleshooting.md) - โœ… **Smoke test script**: [docs/getting-started/smoke-test.md](docs/getting-started/smoke-test.md) - ๐Ÿงช **CLI commands reference**: [docs/cli/commands.md](docs/cli/commands.md) -- ๐Ÿš€ **v0.2.0 release notes**: [docs/releases/v0.2.0.md](docs/releases/v0.2.0.md) +- ๐Ÿš€ **v0.3.0 release notes**: [docs/releases/v0.3.0.md](docs/releases/v0.3.0.md) - **By Topic** - โš™๏ธ **Environment variables**: [docs/configuration/environment-variables.md](docs/configuration/environment-variables.md) @@ -44,6 +51,7 @@ Then follow the Docker setup guide in [docs/getting-started/docker.md](docs/gett - ๐Ÿ”’ **Security model**: [docs/concepts/security-model.md](docs/concepts/security-model.md) - ๐Ÿ“˜ **Glossary**: [docs/concepts/glossary.md](docs/concepts/glossary.md) - ๐Ÿ”‘ **Key management operations**: [docs/operations/key-management.md](docs/operations/key-management.md) +- ๐Ÿ“Š **Monitoring and metrics**: [docs/operations/monitoring.md](docs/operations/monitoring.md) - ๐Ÿš‘ **Failure playbooks**: [docs/operations/failure-playbooks.md](docs/operations/failure-playbooks.md) - ๐Ÿญ **Production deployment**: [docs/operations/production.md](docs/operations/production.md) - ๐Ÿ› ๏ธ **Development and testing**: 
[docs/development/testing.md](docs/development/testing.md) @@ -74,6 +82,7 @@ All detailed guides include practical use cases and copy/paste-ready examples. - ๐Ÿ‘ค Token-based authentication and policy-based authorization - ๐Ÿ“ฆ Versioned secrets by path (`/v1/secrets/*path`) - ๐Ÿ“œ Audit logs with request correlation (`request_id`) and filtering +- ๐Ÿ“Š OpenTelemetry metrics with Prometheus-compatible `/metrics` export ## ๐ŸŒ API Overview @@ -84,6 +93,7 @@ All detailed guides include practical use cases and copy/paste-ready examples. - Secrets: `POST/GET/DELETE /v1/secrets/*path` - Transit: `POST /v1/transit/keys`, `POST /v1/transit/keys/:name/rotate`, `POST /v1/transit/keys/:name/encrypt`, `POST /v1/transit/keys/:name/decrypt`, `DELETE /v1/transit/keys/:id` ([create vs rotate](docs/api/transit.md#create-vs-rotate), [error matrix](docs/api/transit.md#endpoint-error-matrix)) - Audit logs: `GET /v1/audit-logs` +- Metrics: `GET /metrics` (available when `METRICS_ENABLED=true`) ## ๐Ÿ“„ License diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index f60e5ff..c30534c 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,6 +1,24 @@ # ๐Ÿ—’๏ธ Documentation Changelog -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 + +## 2026-02-16 (docs v5 - documentation quality improvements) + +- Added `What's New in v0.3.0` section to root `README.md` +- Added Prometheus + Grafana quickstart and a metrics naming contract to `docs/operations/monitoring.md` +- Added production hardening guidance for securing `/metrics` exposure +- Added feature PR docs consistency checklist to `docs/contributing.md` +- Added metrics troubleshooting matrix to `docs/getting-started/troubleshooting.md` +- Added local and Docker command parity examples in `docs/cli/commands.md` +- Added telemetry breaking vs non-breaking examples in `docs/api/versioning-policy.md` + +## 2026-02-16 (docs v4 - v0.3.0 release prep) + +- Added release notes page: `docs/releases/v0.3.0.md` and set it as the current 
release in docs indexes +- Updated pinned Docker examples from `allisson/secrets:v0.2.0` to `allisson/secrets:v0.3.0` +- Added monitoring links to root README and expanded API overview with `GET /metrics` +- Aligned monitoring operations with implementation (`secret_create`, `secret_get_version`, `audit_log_delete`, `transit_key_rotate`) +- Clarified metrics disable behavior (`METRICS_ENABLED=false` removes metrics middleware and `/metrics` route) ## 2026-02-14 (docs v3 - v0.2.0 release prep) diff --git a/docs/README.md b/docs/README.md index 799928b..2bbb122 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ # ๐Ÿ“š Secrets Documentation -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 Welcome to the full documentation for Secrets. Pick a path and dive in ๐Ÿš€ @@ -25,6 +25,7 @@ Welcome to the full documentation for Secrets. Pick a path and dive in ๐Ÿš€ - ๐Ÿ”’ [concepts/security-model.md](concepts/security-model.md) - ๐Ÿ“˜ [concepts/glossary.md](concepts/glossary.md) - ๐Ÿ”‘ [operations/key-management.md](operations/key-management.md) +- ๐Ÿ“Š [operations/monitoring.md](operations/monitoring.md) - ๐Ÿญ [operations/production.md](operations/production.md) - ๐Ÿš‘ [operations/failure-playbooks.md](operations/failure-playbooks.md) - ๐Ÿ› ๏ธ [development/testing.md](development/testing.md) @@ -55,12 +56,13 @@ Welcome to the full documentation for Secrets. 
Pick a path and dive in ๐Ÿš€ OpenAPI scope note: -- `openapi.yaml` is a baseline subset for common API flows in `v0.2.0` +- `openapi.yaml` is a baseline subset for common API flows in `v0.3.0` - Full endpoint behavior is documented in the endpoint pages under `docs/api/` ## ๐Ÿš€ Releases -- ๐Ÿ“ฆ [releases/v0.2.0.md](releases/v0.2.0.md) +- ๐Ÿ“ฆ [releases/v0.3.0.md](releases/v0.3.0.md) +- ๐Ÿ“ฆ [releases/v0.2.0.md](releases/v0.2.0.md) (historical) - ๐Ÿ“ฆ [releases/v0.1.0.md](releases/v0.1.0.md) (historical) ## ๐Ÿง  ADRs diff --git a/docs/api/versioning-policy.md b/docs/api/versioning-policy.md index 9ab14ee..641bc50 100644 --- a/docs/api/versioning-policy.md +++ b/docs/api/versioning-policy.md @@ -1,6 +1,6 @@ # ๐Ÿงฉ API Compatibility and Versioning Policy -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 > Applies to: API v1 This page defines compatibility expectations for HTTP API changes. @@ -11,7 +11,7 @@ This page defines compatibility expectations for HTTP API changes. - Existing endpoint paths and JSON field names are treated as stable unless explicitly deprecated - OpenAPI source of truth: `docs/openapi.yaml` -## OpenAPI Coverage (v0.2.0) +## OpenAPI Coverage (v0.3.0) - `docs/openapi.yaml` is a baseline subset focused on high-traffic/common integration flows - Endpoint pages in `docs/api/*.md` define full public behavior for covered operations @@ -19,7 +19,7 @@ This page defines compatibility expectations for HTTP API changes. 
## App Version vs API Version -- Application release `v0.2.0` is pre-1.0 software and may evolve quickly +- Application release `v0.3.0` is pre-1.0 software and may evolve quickly - API v1 path contract (`/v1/*`) remains the compatibility baseline for consumers - Breaking API behavior changes require explicit documentation and migration notes @@ -48,6 +48,19 @@ Usually non-breaking: - clarifying documentation text and examples - adding additional error examples without changing behavior +## Telemetry Change Examples + +Breaking telemetry examples: + +- renaming a published metric name (for example `secrets_http_requests_total`) +- renaming/removing metric labels used by dashboards or alerts + +Non-breaking telemetry examples: + +- adding a new metric family +- adding new label values for existing labels +- adding new dashboard examples without changing metric contracts + ## Deprecation Guidance - Mark deprecated behavior clearly in endpoint docs diff --git a/docs/cli/commands.md b/docs/cli/commands.md index 6c02dd5..8ffabe2 100644 --- a/docs/cli/commands.md +++ b/docs/cli/commands.md @@ -1,6 +1,6 @@ # ๐Ÿงช CLI Commands Reference -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 Use the `app` CLI for server runtime, key management, and client lifecycle operations. @@ -12,10 +12,10 @@ Local binary: ./bin/app [flags] ``` -Docker image (v0.2.0): +Docker image (v0.3.0): ```bash -docker run --rm --env-file .env allisson/secrets:v0.2.0 [flags] +docker run --rm --env-file .env allisson/secrets:v0.3.0 [flags] ``` ## Core Runtime @@ -24,18 +24,34 @@ docker run --rm --env-file .env allisson/secrets:v0.2.0 [flags] Starts the HTTP API server. +Local: + ```bash ./bin/app server ``` +Docker: + +```bash +docker run --rm --network secrets-net --env-file .env -p 8080:8080 allisson/secrets:v0.3.0 server +``` + ### `migrate` Runs database migrations. 
+Local: + ```bash ./bin/app migrate ``` +Docker: + +```bash +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 migrate +``` + ## Key Management ### `create-master-key` @@ -46,10 +62,18 @@ Flags: - `--id`, `-i`: master key ID +Local: + ```bash ./bin/app create-master-key --id default ``` +Docker: + +```bash +docker run --rm allisson/secrets:v0.3.0 create-master-key --id default +``` + ### `create-kek` Creates an initial KEK from the active master key. @@ -58,10 +82,18 @@ Flags: - `--algorithm`, `--alg`: `aes-gcm` (default) or `chacha20-poly1305` +Local: + ```bash ./bin/app create-kek --algorithm aes-gcm ``` +Docker: + +```bash +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 create-kek --algorithm aes-gcm +``` + ### `rotate-kek` Rotates KEK to a new version. @@ -70,10 +102,18 @@ Flags: - `--algorithm`, `--alg`: `aes-gcm` (default) or `chacha20-poly1305` +Local: + ```bash ./bin/app rotate-kek --algorithm aes-gcm ``` +Docker: + +```bash +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 rotate-kek --algorithm aes-gcm +``` + After master key or KEK rotation, restart API server instances so they load updated key material. ## Client Management @@ -155,8 +195,9 @@ Examples: ./bin/app clean-audit-logs --days 90 --format text # Docker form -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.2.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 \ clean-audit-logs --days 90 --dry-run --format json + ``` Example text output: diff --git a/docs/configuration/environment-variables.md b/docs/configuration/environment-variables.md index fc26d05..9217497 100644 --- a/docs/configuration/environment-variables.md +++ b/docs/configuration/environment-variables.md @@ -1,6 +1,6 @@ # โš™๏ธ Environment Variables -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 Secrets is configured through environment variables. 
@@ -21,6 +21,9 @@ MASTER_KEYS=default:BASE64_32_BYTE_KEY ACTIVE_MASTER_KEY_ID=default AUTH_TOKEN_EXPIRATION_SECONDS=86400 + +METRICS_ENABLED=true +METRICS_NAMESPACE=secrets ``` ## Notes @@ -29,6 +32,9 @@ AUTH_TOKEN_EXPIRATION_SECONDS=86400 - ๐Ÿ“ Each master key must represent exactly 32 bytes (256 bits) - โญ `ACTIVE_MASTER_KEY_ID` selects which master key encrypts new KEKs - โฑ๏ธ `AUTH_TOKEN_EXPIRATION_SECONDS` defaults to 24h behavior when set to `86400` +- ๐Ÿ“Š `METRICS_ENABLED` enables/disables OpenTelemetry metrics collection (default: `true`) +- ๐Ÿท๏ธ `METRICS_NAMESPACE` sets the prefix for all metric names (default: `secrets`) +- ๐Ÿ“‰ When `METRICS_ENABLED=false`, HTTP metrics middleware and the `/metrics` route are disabled - ๐Ÿ”„ After changing `MASTER_KEYS` or `ACTIVE_MASTER_KEY_ID`, restart API servers to load new values ## Master key generation @@ -45,6 +51,7 @@ docker run --rm allisson/secrets:latest create-master-key --id default ## See also +- [Monitoring](../operations/monitoring.md) - [Docker getting started](../getting-started/docker.md) - [Local development](../getting-started/local-development.md) - [Production operations](../operations/production.md) diff --git a/docs/contributing.md b/docs/contributing.md index f02b635..5e85f54 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,6 +1,6 @@ # ๐Ÿค Documentation Contributing Guide -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 Use this guide when adding or editing project documentation. @@ -66,6 +66,16 @@ This target runs markdown linting and offline markdown link validation. 4. Terminology is consistent across files 5. `docs/CHANGELOG.md` updated for significant documentation changes +## Feature PR Docs Consistency Checklist + +For behavior changes, update all relevant docs in the same PR: + +1. Environment variables and defaults (`docs/configuration/environment-variables.md`) +2. API overview and endpoint pages (`README.md`, `docs/api/*.md`) +3. 
Operational runbooks (`docs/operations/*.md`) +4. Release notes (`docs/releases/vX.Y.Z.md`) and `docs/CHANGELOG.md` +5. Local and Docker examples (`docs/getting-started/*.md`, `docs/cli/commands.md`) + ## Ownership and Review Cadence - Docs owners: project maintainers and reviewers for touched domain (`api`, `operations`, `security`) diff --git a/docs/getting-started/docker.md b/docs/getting-started/docker.md index d580424..2ff5f65 100644 --- a/docs/getting-started/docker.md +++ b/docs/getting-started/docker.md @@ -1,10 +1,10 @@ # ๐Ÿณ Run with Docker (Recommended) -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 This is the default way to run Secrets. -For release reproducibility, this guide uses the pinned image tag `allisson/secrets:v0.2.0`. +For release reproducibility, this guide uses the pinned image tag `allisson/secrets:v0.3.0`. You can use `allisson/secrets:latest` for fast iteration. ## โšก Quickstart Copy Block @@ -12,7 +12,7 @@ You can use `allisson/secrets:latest` for fast iteration. 
Use this minimal flow when you just want to get a working instance quickly: ```bash -docker pull allisson/secrets:v0.2.0 +docker pull allisson/secrets:v0.3.0 docker network create secrets-net || true docker run -d --name secrets-postgres --network secrets-net \ @@ -21,19 +21,19 @@ docker run -d --name secrets-postgres --network secrets-net \ -e POSTGRES_DB=mydb \ postgres:16-alpine -docker run --rm allisson/secrets:v0.2.0 create-master-key --id default +docker run --rm allisson/secrets:v0.3.0 create-master-key --id default # copy generated MASTER_KEYS and ACTIVE_MASTER_KEY_ID into .env -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.2.0 migrate -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.2.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 create-kek --algorithm aes-gcm docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.2.0 server + allisson/secrets:v0.3.0 server ``` ## 1) Pull the image ```bash -docker pull allisson/secrets:v0.2.0 +docker pull allisson/secrets:v0.3.0 ``` ## 2) Start PostgreSQL @@ -51,7 +51,7 @@ docker run -d --name secrets-postgres --network secrets-net \ ## 3) Generate a master key ```bash -docker run --rm allisson/secrets:v0.2.0 create-master-key --id default +docker run --rm allisson/secrets:v0.3.0 create-master-key --id default ``` Copy the generated values into a local `.env` file. 
@@ -74,21 +74,24 @@ MASTER_KEYS=default:REPLACE_WITH_BASE64_32_BYTE_KEY ACTIVE_MASTER_KEY_ID=default AUTH_TOKEN_EXPIRATION_SECONDS=86400 + +METRICS_ENABLED=true +METRICS_NAMESPACE=secrets EOF ``` ## 5) Run migrations and bootstrap KEK ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.2.0 migrate -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.2.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 create-kek --algorithm aes-gcm ``` ## 6) Start the API server ```bash docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.2.0 server + allisson/secrets:v0.3.0 server ``` ## 7) Verify @@ -108,7 +111,7 @@ Expected: Use the CLI command to create your first API client and policy set: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.2.0 create-client \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.3.0 create-client \ --name bootstrap-admin \ --active \ --policies '[{"path":"*","capabilities":["read","write","delete","encrypt","decrypt","rotate"]}]' \ diff --git a/docs/getting-started/troubleshooting.md b/docs/getting-started/troubleshooting.md index 7dcdb05..405964b 100644 --- a/docs/getting-started/troubleshooting.md +++ b/docs/getting-started/troubleshooting.md @@ -1,6 +1,6 @@ # ๐Ÿงฐ Troubleshooting -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 Use this guide for common setup and runtime errors. @@ -14,6 +14,7 @@ Use this quick route before diving into detailed sections: 4. API requests return `422` -> go to `422 Unprocessable Entity` (payload/query format) 5. After rotating keys behavior is stale -> go to `Rotation completed but server still uses old key context` 6. 
Startup fails with key config errors -> go to `Missing or Invalid Master Keys` +7. Monitoring data is missing -> go to `Metrics Troubleshooting Matrix` ## ๐Ÿ“‘ Table of Contents @@ -25,6 +26,7 @@ Use this quick route before diving into detailed sections: - [Migration failure](#migration-failure) - [Missing or Invalid Master Keys](#missing-or-invalid-master-keys) - [Missing KEK](#missing-kek) +- [Metrics Troubleshooting Matrix](#metrics-troubleshooting-matrix) - [Rotation completed but server still uses old key context](#rotation-completed-but-server-still-uses-old-key-context) - [Token issuance fails with valid-looking credentials](#token-issuance-fails-with-valid-looking-credentials) - [Quick diagnostics checklist](#quick-diagnostics-checklist) @@ -119,6 +121,16 @@ Common 422 cases: - run `create-kek` once after migration - verify key creation logs +## Metrics Troubleshooting Matrix + +| Symptom | Likely cause | Fix | +| --- | --- | --- | +| `GET /metrics` returns `404` | `METRICS_ENABLED=false` or server restarted with metrics disabled | Set `METRICS_ENABLED=true` and restart server | +| Prometheus scrape target is down | Wrong host/port or network path | Verify target URL and network reachability from Prometheus | +| Metrics present but missing expected prefix | Unexpected namespace value | Confirm `METRICS_NAMESPACE` and update queries/dashboards | +| Dashboards show empty values for paths | Query uses concrete URLs, not route patterns | Query by route pattern labels (for example `/v1/secrets/*path`) | +| Prometheus memory growth or slow queries | High-cardinality query patterns | Aggregate by stable labels and avoid per-request dimensions | + ## Rotation completed but server still uses old key context - Symptom: master key/KEK rotation completed, but runtime behavior suggests old values are still in use diff --git a/docs/operations/monitoring.md b/docs/operations/monitoring.md new file mode 100644 index 0000000..4529977 --- /dev/null +++ 
b/docs/operations/monitoring.md @@ -0,0 +1,359 @@ +# ๐Ÿ“Š Monitoring + +> Last updated: 2026-02-16 + +This document describes the metrics instrumentation and monitoring capabilities in the Secrets application. + +## Overview + +The application uses OpenTelemetry for metrics instrumentation with a Prometheus-compatible export endpoint. Metrics can be enabled/disabled via configuration and cover two main areas: + +1. **Business Operations** - Domain-specific operation counters and durations +2. **HTTP Requests** - Request counts and response times + +## Configuration + +### Environment Variables + +```bash +# Enable or disable metrics collection +METRICS_ENABLED=true # default: true + +# Namespace prefix for all metrics +METRICS_NAMESPACE=secrets # default: secrets +``` + +Update your `.env` file: + +```bash +# Metrics configuration +METRICS_ENABLED=true +METRICS_NAMESPACE=secrets +``` + +## Quickstart (Prometheus + Grafana) + +Use this minimal local stack to visualize Secrets metrics quickly: + +1. Start Secrets with metrics enabled +2. Start Prometheus with a scrape config for `http://host.docker.internal:8080/metrics` +3. Open Grafana and create panels from Prometheus queries + +Note: On Linux, replace `host.docker.internal` with the host IP reachable from your Docker network. 
+ +Minimal `prometheus.yml`: + +```yaml +global: + scrape_interval: 15s + +scrape_configs: + - job_name: "secrets" + static_configs: + - targets: ["host.docker.internal:8080"] + metrics_path: "/metrics" +``` + +Quick run commands: + +```bash +# Start Prometheus +docker run --rm -d --name prom \ + -p 9090:9090 \ + -v "$(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \ + prom/prometheus + +# Start Grafana +docker run --rm -d --name grafana \ + -p 3000:3000 \ + grafana/grafana +``` + +Suggested first panel query (requests/sec by route): + +```promql +sum(rate(secrets_http_requests_total[5m])) by (method, path) +``` + +## Metrics Endpoint + +The metrics are exposed at the `/metrics` endpoint in Prometheus exposition format: + +```bash +curl http://localhost:8080/metrics +``` + +**Key Points:** + +- **No Authentication Required** - The `/metrics` endpoint has no application-level authentication (standard Prometheus practice); restrict network access to it in production (see [Production Deployment](production.md)) +- **Prometheus Compatible** - Supports both text format and OpenMetrics format +- **Located Outside API Versioning** - Available at `/metrics`, not `/v1/metrics` + +## Available Metrics + +### Metrics Naming Contract + +These metrics are treated as stable for dashboard and alert compatibility. 
+ +| Metric | Type | Labels | Stability | +| --- | --- | --- | --- | +| `{namespace}_http_requests_total` | Counter | `method`, `path`, `status_code` | Stable | +| `{namespace}_http_request_duration_seconds` | Histogram | `method`, `path`, `status_code` | Stable | +| `{namespace}_operations_total` | Counter | `domain`, `operation`, `status` | Stable | +| `{namespace}_operation_duration_seconds` | Histogram | `domain`, `operation`, `status` | Stable | + +Compatibility note: + +- Renaming metric names or labels is considered a breaking observability change +- Additive changes (new metrics/new label values) are generally non-breaking + +### Business Operation Metrics + +#### `{namespace}_operations_total` + +**Type:** Counter +**Description:** Total number of business operations executed +**Labels:** + +- `domain` - Business domain (auth, secrets, transit) +- `operation` - Operation name (e.g., client_create, secret_get, transit_encrypt) +- `status` - Operation result (success, error) + +**Example:** + +```prometheus +secrets_operations_total{domain="auth",operation="client_create",status="success"} 42 +secrets_operations_total{domain="secrets",operation="secret_get",status="success"} 1337 +secrets_operations_total{domain="transit",operation="transit_key_rotate",status="error"} 2 +``` + +#### `{namespace}_operation_duration_seconds` + +**Type:** Histogram +**Description:** Duration of business operations in seconds +**Labels:** + +- `domain` - Business domain (auth, secrets, transit) +- `operation` - Operation name +- `status` - Operation result (success, error) + +**Example:** + +```prometheus +secrets_operation_duration_seconds_bucket{domain="auth",operation="client_create",status="success",le="0.005"} 15 +secrets_operation_duration_seconds_bucket{domain="auth",operation="client_create",status="success",le="0.01"} 28 +secrets_operation_duration_seconds_sum{domain="auth",operation="client_create",status="success"} 1.25 
+secrets_operation_duration_seconds_count{domain="auth",operation="client_create",status="success"} 42 +``` + +### HTTP Request Metrics + +#### `{namespace}_http_requests_total` + +**Type:** Counter +**Description:** Total number of HTTP requests received +**Labels:** + +- `method` - HTTP method (GET, POST, PUT, DELETE) +- `path` - Route pattern (e.g., /v1/secrets/*path) +- `status_code` - HTTP status code (200, 404, 500, etc.) + +**Example:** + +```prometheus +secrets_http_requests_total{method="GET",path="/v1/secrets/*path",status_code="200"} 1234 +secrets_http_requests_total{method="POST",path="/v1/clients",status_code="201"} 56 +secrets_http_requests_total{method="GET",path="/health",status_code="200"} 9999 +``` + +#### `{namespace}_http_request_duration_seconds` + +**Type:** Histogram +**Description:** Duration of HTTP requests in seconds +**Labels:** + +- `method` - HTTP method +- `path` - Route pattern +- `status_code` - HTTP status code + +**Example:** + +```prometheus +secrets_http_request_duration_seconds_bucket{method="GET",path="/v1/secrets/*path",status_code="200",le="0.005"} 800 +secrets_http_request_duration_seconds_bucket{method="GET",path="/v1/secrets/*path",status_code="200",le="0.01"} 1100 +secrets_http_request_duration_seconds_sum{method="GET",path="/v1/secrets/*path",status_code="200"} 6.789 +secrets_http_request_duration_seconds_count{method="GET",path="/v1/secrets/*path",status_code="200"} 1234 +``` + +## Business Domains and Operations + +### Auth Domain + +| Operation | Description | +|-----------|-------------| +| `client_create` | Create new API client | +| `client_get` | Retrieve client by ID | +| `client_update` | Update client configuration | +| `client_delete` | Delete API client | +| `client_list` | List all clients | +| `token_issue` | Issue authentication token | +| `token_authenticate` | Validate token | +| `audit_log_create` | Record audit log entry | +| `audit_log_list` | List audit logs | +| `audit_log_delete` | Delete audit 
logs older than retention | + +### Secrets Domain + +| Operation | Description | +|-----------|-------------| +| `secret_create` | Create or update secret | +| `secret_get` | Retrieve secret value | +| `secret_get_version` | Retrieve secret by explicit version | +| `secret_delete` | Delete secret | + +### Transit Domain + +| Operation | Description | +|-----------|-------------| +| `transit_key_create` | Create new transit key | +| `transit_key_rotate` | Rotate transit key to new version | +| `transit_key_delete` | Delete transit key | +| `transit_encrypt` | Encrypt data with transit key | +| `transit_decrypt` | Decrypt data with transit key | + +## Prometheus Configuration + +### Scrape Configuration + +Add the Secrets application to your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: 'secrets' + static_configs: + - targets: ['localhost:8080'] + metrics_path: '/metrics' + scrape_interval: 15s +``` + +### Example Queries + +**Total requests per second (rate over 5 minutes):** + +```promql +sum(rate(secrets_http_requests_total[5m])) +``` + +**95th percentile request latency:** + +```promql +histogram_quantile(0.95, rate(secrets_http_request_duration_seconds_bucket[5m])) +``` + +**Error rate by domain:** + +```promql +sum by (domain) (rate(secrets_operations_total{status="error"}[5m])) / sum by (domain) (rate(secrets_operations_total[5m])) +``` + +**Slowest operations:** + +```promql +topk(5, rate(secrets_operation_duration_seconds_sum[5m]) / rate(secrets_operation_duration_seconds_count[5m])) +``` + +## Grafana Dashboard + +### Recommended Panels + +1. **Request Rate** - Line graph showing HTTP requests/sec +2. **Error Rate** - Percentage of failed operations by domain +3. **Latency Heatmap** - Distribution of request durations +4. **Operation Counts** - Table showing top operations by volume +5. 
**Status Code Distribution** - Pie chart of HTTP status codes + +### Example Panel Query (Request Rate) + +```promql +sum(rate(secrets_http_requests_total[5m])) by (method, path) +``` + +## Alerting + +### Recommended Alerts + +#### High Error Rate + +```yaml +- alert: HighErrorRate + expr: rate(secrets_operations_total{status="error"}[5m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value }} errors/sec" +``` + +#### High Latency + +```yaml +- alert: HighLatency + expr: histogram_quantile(0.95, rate(secrets_http_request_duration_seconds_bucket[5m])) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "High request latency" + description: "95th percentile latency is {{ $value }}s" +``` + +## Disabling Metrics + +To disable metrics collection, set `METRICS_ENABLED=false` in your environment: + +```bash +export METRICS_ENABLED=false +``` + +When disabled: + +- The `/metrics` endpoint is not registered (requests return 404 Not Found) +- No metrics are collected (zero overhead) +- HTTP metrics middleware is not applied +- Business metrics use a no-op implementation + +## Performance Considerations + +- **Low Overhead** - OpenTelemetry metrics have minimal performance impact +- **Cardinality Control** - Labels are carefully chosen to avoid high cardinality + - Paths use route patterns (e.g., `/v1/secrets/*path`) instead of actual values + - Operations are predefined, not dynamic + - Status values are limited to "success" and "error" +- **Memory Usage** - Metrics are held in process memory as cumulative series; usage scales with the number of distinct label combinations, not with how often Prometheus scrapes + +## Troubleshooting + +### Metrics endpoint returns 404 + +- Check that `METRICS_ENABLED=true` is set in your environment +- Verify the server is running and accessible + +### Missing metrics + +- Ensure Prometheus is scraping the `/metrics` endpoint +- Check that operations are actually being executed +- Verify the namespace matches your configuration 
(`METRICS_NAMESPACE`) + +### High memory usage + +- Review Prometheus scrape interval (recommend 15-30 seconds) +- Check for high cardinality labels (should not occur with proper configuration) +- Verify metrics are being scraped regularly + +## See Also + +- [Production Deployment](production.md) +- [Failure Playbooks](failure-playbooks.md) +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) +- [Prometheus Documentation](https://prometheus.io/docs/) diff --git a/docs/operations/production.md b/docs/operations/production.md index 519e115..8cb0df2 100644 --- a/docs/operations/production.md +++ b/docs/operations/production.md @@ -1,6 +1,6 @@ # ๐Ÿญ Production Deployment Guide -> Last updated: 2026-02-14 +> Last updated: 2026-02-16 This guide covers baseline production hardening and operations for Secrets. @@ -97,6 +97,14 @@ Multi-node: - Alert on repeated denied authorization attempts from same client/IP - Track API latency and error rates by endpoint - Correlate request failures using `request_id` +- Scrape and alert on `secrets_http_requests_total`, `secrets_http_request_duration_seconds`, and `secrets_operations_total` + +Secure `/metrics` in production: + +1. Keep `/metrics` reachable only from internal monitoring networks +2. Restrict source IP ranges at load balancer or reverse proxy +3. If needed, add proxy-level auth in front of `/metrics` +4. 
Do not expose `/metrics` on public internet-facing routes SLO examples (starting point): @@ -130,6 +138,7 @@ SLO examples (starting point): ## See also - [Key management operations](key-management.md) +- [Monitoring](monitoring.md) - [Environment variables](../configuration/environment-variables.md) - [Security model](../concepts/security-model.md) - [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/releases/v0.3.0.md b/docs/releases/v0.3.0.md new file mode 100644 index 0000000..bb76483 --- /dev/null +++ b/docs/releases/v0.3.0.md @@ -0,0 +1,57 @@ +# ๐Ÿš€ Secrets v0.3.0 Release Notes + +> Release date: 2026-02-16 + +This release adds metrics instrumentation and Prometheus-compatible monitoring support. + +## Highlights + +- Added OpenTelemetry metrics provider with Prometheus exporter +- Added optional `/metrics` endpoint for Prometheus scraping +- Added HTTP metrics middleware for request counts and latency histograms +- Added business operation metrics across auth, secrets, and transit use cases +- Added metrics configuration via `METRICS_ENABLED` and `METRICS_NAMESPACE` + +## Metrics and Monitoring + +New metric families: + +- `{namespace}_http_requests_total` +- `{namespace}_http_request_duration_seconds` +- `{namespace}_operations_total` +- `{namespace}_operation_duration_seconds` + +Runtime behavior: + +- When `METRICS_ENABLED=true` (default), the server exposes `GET /metrics` +- When `METRICS_ENABLED=false`, metrics middleware and `/metrics` are not registered +- `METRICS_NAMESPACE` (default `secrets`) prefixes metric names + +## Runtime and Compatibility + +- API baseline remains v1 (`/v1/*`) +- Metrics endpoint is outside API versioning (`/metrics`) +- Local development targets: Linux and macOS +- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` +- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` + +## Upgrade Notes + +- Non-breaking addition: observability and metrics instrumentation +- Existing API paths and 
behavior remain compatible under API v1 documentation +- Update your environment configuration if you want custom metric namespace values + +Example: + +```bash +export METRICS_ENABLED=true +export METRICS_NAMESPACE=secrets +curl http://localhost:8080/metrics +``` + +## See also + +- [Monitoring operations guide](../operations/monitoring.md) +- [Environment variables](../configuration/environment-variables.md) +- [Production operations](../operations/production.md) +- [API compatibility policy](../api/versioning-policy.md) diff --git a/go.mod b/go.mod index 43a19d3..badb122 100644 --- a/go.mod +++ b/go.mod @@ -13,20 +13,29 @@ require ( github.com/jellydator/validation v1.2.0 github.com/joho/godotenv v1.5.1 github.com/lib/pq v1.11.2 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 github.com/urfave/cli/v3 v3.6.2 + go.opentelemetry.io/otel v1.40.0 + go.opentelemetry.io/otel/exporters/prometheus v0.62.0 + go.opentelemetry.io/otel/metric v1.40.0 + go.opentelemetry.io/otel/sdk/metric v1.40.0 golang.org/x/crypto v0.48.0 ) require ( filippo.io/edwards25519 v1.1.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/bytedance/sonic v1.14.0 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect github.com/ccoveille/go-safecast/v2 v2.0.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudwego/base64x v0.1.6 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/gabriel-vasile/mimetype v1.4.8 // indirect github.com/gin-contrib/sse v1.1.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.27.0 // indirect @@ -38,14 +47,23 @@ require ( github.com/mattn/go-isatty v0.0.20 // indirect github.com/modern-go/concurrent 
v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/otlptranslator v1.0.0 // indirect + github.com/prometheus/procfs v0.19.2 // indirect github.com/quic-go/qpack v0.5.1 // indirect github.com/quic-go/quic-go v0.54.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/sdk v1.40.0 // indirect + go.opentelemetry.io/otel/trace v1.40.0 // indirect go.uber.org/mock v0.5.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect golang.org/x/arch v0.20.0 // indirect golang.org/x/mod v0.32.0 // indirect golang.org/x/net v0.49.0 // indirect @@ -53,6 +71,6 @@ require ( golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect golang.org/x/tools v0.41.0 // indirect - google.golang.org/protobuf v1.36.9 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index ae80a87..4ab26cd 100644 --- a/go.sum +++ b/go.sum @@ -10,12 +10,16 @@ github.com/allisson/go-pwdhash v0.3.1 h1:UzR/0V77E6l63fV6EuAUj0nj1S2jdGADzgoO7UB github.com/allisson/go-pwdhash v0.3.1/go.mod h1:qMlMlCyJ2zwSV8Df406IKgY4VC/39FpiaLamOmZezYU= github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d h1:Byv0BzEl3/e6D5CLfI0j/7hiIEtvGVFPCZ7Ei2oq8iQ= github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bytedance/sonic v1.14.0 h1:/OfKt8HFw0kh2rj8N0F6C/qPGRESq0BbaNZgcNXXzQQ= github.com/bytedance/sonic v1.14.0/go.mod h1:WoEbx8WTcFJfzCe0hbmyTGrfjt8PzNEBdxlNUO24NhA= github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZwvZJyqeA= github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= github.com/ccoveille/go-safecast/v2 v2.0.0 h1:+5eyITXAUj3wMjad6cRVJKGnC7vDS55zk0INzJagub0= github.com/ccoveille/go-safecast/v2 v2.0.0/go.mod h1:JIYA4CAR33blIDuE6fSwCp2sz1oOBahXnvmdBhOAABs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M= github.com/cloudwego/base64x v0.1.6/go.mod h1:OFcloc187FXDaYHvrNIjxSe8ncn0OOM8gEHfghB2IPU= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= @@ -46,6 +50,7 @@ github.com/gin-contrib/sse v1.1.0 h1:n0w2GMuUpWDVp7qSpvze6fAu9iRxJY4Hmj6AmBOU05w github.com/gin-contrib/sse v1.1.0/go.mod h1:hxRZ5gVpWMT7Z0B0gSNYqqsSCNIJMjzvm6fqCz9vjwM= github.com/gin-gonic/gin v1.11.0 h1:OW/6PLjyusp2PPXtyxKHU0RbX6I/l28FTdDlae5ueWk= github.com/gin-gonic/gin v1.11.0/go.mod h1:+iq/FyxlGzII0KHiBGjuNn4UNENUlKbGlNmc+W50Dls= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -79,8 +84,16 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/json-iterator/go v1.1.12 
h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= github.com/lib/pq v1.11.2 h1:x6gxUeu39V0BHZiugWe8LXZYZ+Utk7hSJGThs8sdzfs= @@ -98,6 +111,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod 
h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= @@ -109,10 +124,22 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos= +github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM= +github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= +github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= github.com/quic-go/qpack v0.5.1 h1:giqksBPnT/HDtZ6VhtFKgoLOWmlyo9Ei6u9PqzIMbhI= github.com/quic-go/qpack v0.5.1/go.mod h1:+PC4XFrEskIVkcLzpEkbLqq1uCoxPhQuvK5rH1ZgaEg= github.com/quic-go/quic-go v0.54.0 h1:6s1YB9QotYI6Ospeiguknbp2Znb/jZYjZLRXn9kMQBg= github.com/quic-go/quic-go v0.54.0/go.mod h1:e68ZEaCdyviluZmy44P6Iey98v/Wfz6HCjQEm+l8zTY= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod 
h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -131,18 +158,28 @@ github.com/ugorji/go/codec v1.3.0 h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= github.com/urfave/cli/v3 v3.6.2 h1:lQuqiPrZ1cIz8hz+HcrG0TNZFxU70dPZ3Yl+pSrH9A8= github.com/urfave/cli/v3 v3.6.2/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= +go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= 
+go.opentelemetry.io/otel/exporters/prometheus v0.62.0 h1:krvC4JMfIOVdEuNPTtQ0ZjCiXrybhv+uOHMfHRmnvVo= +go.opentelemetry.io/otel/exporters/prometheus v0.62.0/go.mod h1:fgOE6FM/swEnsVQCqCnbOfRV4tOnWPg7bVeo4izBuhQ= +go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= +go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= +go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= +go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= +go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= +go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c= golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk= golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= @@ -160,10 +197,11 @@ golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= 
-google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw= -google.golang.org/protobuf v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/internal/app/di.go b/internal/app/di.go index dd35533..e4f05d2 100644 --- a/internal/app/di.go +++ b/internal/app/di.go @@ -20,6 +20,7 @@ import ( cryptoUseCase "github.com/allisson/secrets/internal/crypto/usecase" "github.com/allisson/secrets/internal/database" "github.com/allisson/secrets/internal/http" + "github.com/allisson/secrets/internal/metrics" secretsHTTP "github.com/allisson/secrets/internal/secrets/http" secretsRepository "github.com/allisson/secrets/internal/secrets/repository" secretsUseCase "github.com/allisson/secrets/internal/secrets/usecase" @@ -41,6 +42,10 @@ type Container struct { // Managers txManager database.TxManager + // Metrics + metricsProvider *metrics.Provider + businessMetrics metrics.BusinessMetrics + // Services aeadManager cryptoService.AEADManager keyManager cryptoService.KeyManager @@ -82,6 +87,8 @@ type Container struct { dbInit sync.Once masterKeyChainInit sync.Once txManagerInit sync.Once + 
metricsProviderInit sync.Once + businessMetricsInit sync.Once aeadManagerInit sync.Once keyManagerInit sync.Once secretServiceInit sync.Once @@ -185,6 +192,42 @@ func (c *Container) TxManager() (database.TxManager, error) { return c.txManager, nil } +// MetricsProvider returns the metrics provider for Prometheus export. +func (c *Container) MetricsProvider() (*metrics.Provider, error) { + var err error + c.metricsProviderInit.Do(func() { + c.metricsProvider, err = c.initMetricsProvider() + if err != nil { + c.initErrors["metricsProvider"] = err + } + }) + if err != nil { + return nil, err + } + if storedErr, exists := c.initErrors["metricsProvider"]; exists { + return nil, storedErr + } + return c.metricsProvider, nil +} + +// BusinessMetrics returns the business metrics recorder. +func (c *Container) BusinessMetrics() (metrics.BusinessMetrics, error) { + var err error + c.businessMetricsInit.Do(func() { + c.businessMetrics, err = c.initBusinessMetrics() + if err != nil { + c.initErrors["businessMetrics"] = err + } + }) + if err != nil { + return nil, err + } + if storedErr, exists := c.initErrors["businessMetrics"]; exists { + return nil, storedErr + } + return c.businessMetrics, nil +} + // AEADManager returns the AEAD manager service. func (c *Container) AEADManager() cryptoService.AEADManager { c.aeadManagerInit.Do(func() { @@ -519,6 +562,13 @@ func (c *Container) Shutdown(ctx context.Context) error { } } + // Shutdown metrics provider if initialized + if c.metricsProvider != nil { + if err := c.metricsProvider.Shutdown(ctx); err != nil { + shutdownErrors = append(shutdownErrors, fmt.Errorf("metrics provider shutdown: %w", err)) + } + } + // Close master key chain if initialized if c.masterKeyChain != nil { c.masterKeyChain.Close() @@ -595,6 +645,40 @@ func (c *Container) initTxManager() (database.TxManager, error) { return database.NewTxManager(db), nil } +// initMetricsProvider creates the metrics provider if metrics are enabled. 
+func (c *Container) initMetricsProvider() (*metrics.Provider, error) { + if !c.config.MetricsEnabled { + return nil, nil + } + + provider, err := metrics.NewProvider(c.config.MetricsNamespace) + if err != nil { + return nil, fmt.Errorf("failed to create metrics provider: %w", err) + } + return provider, nil +} + +// initBusinessMetrics creates the business metrics recorder if metrics are enabled. +func (c *Container) initBusinessMetrics() (metrics.BusinessMetrics, error) { + if !c.config.MetricsEnabled { + return metrics.NewNoOpBusinessMetrics(), nil + } + + provider, err := c.MetricsProvider() + if err != nil { + return nil, fmt.Errorf("failed to get metrics provider: %w", err) + } + if provider == nil { + return metrics.NewNoOpBusinessMetrics(), nil + } + + businessMetrics, err := metrics.NewBusinessMetrics(provider.MeterProvider(), c.config.MetricsNamespace) + if err != nil { + return nil, fmt.Errorf("failed to create business metrics: %w", err) + } + return businessMetrics, nil +} + // initHTTPServer creates the HTTP server with all its dependencies. 
func (c *Container) initHTTPServer() (*http.Server, error) { logger := c.Logger() @@ -648,6 +732,12 @@ func (c *Container) initHTTPServer() (*http.Server, error) { return nil, fmt.Errorf("failed to get audit log use case: %w", err) } + // Get metrics provider (may be nil if metrics are disabled) + metricsProvider, err := c.MetricsProvider() + if err != nil { + return nil, fmt.Errorf("failed to get metrics provider: %w", err) + } + // Setup router with dependencies server.SetupRouter( clientHandler, @@ -659,6 +749,8 @@ func (c *Container) initHTTPServer() (*http.Server, error) { tokenUseCase, tokenService, auditLogUseCase, + metricsProvider, + c.config.MetricsNamespace, ) return server, nil @@ -745,7 +837,18 @@ func (c *Container) initClientUseCase() (authUseCase.ClientUseCase, error) { secretService := c.SecretService() - return authUseCase.NewClientUseCase(txManager, clientRepository, secretService), nil + baseUseCase := authUseCase.NewClientUseCase(txManager, clientRepository, secretService) + + // Wrap with metrics if enabled + if c.config.MetricsEnabled { + businessMetrics, err := c.BusinessMetrics() + if err != nil { + return nil, fmt.Errorf("failed to get business metrics for client use case: %w", err) + } + return authUseCase.NewClientUseCaseWithMetrics(baseUseCase, businessMetrics), nil + } + + return baseUseCase, nil } // initTokenService creates the token service for authentication. 
@@ -802,13 +905,24 @@ func (c *Container) initTokenUseCase() (authUseCase.TokenUseCase, error) { secretService := c.SecretService() tokenService := c.TokenService() - return authUseCase.NewTokenUseCase( + baseUseCase := authUseCase.NewTokenUseCase( c.config, clientRepository, tokenRepository, secretService, tokenService, - ), nil + ) + + // Wrap with metrics if enabled + if c.config.MetricsEnabled { + businessMetrics, err := c.BusinessMetrics() + if err != nil { + return nil, fmt.Errorf("failed to get business metrics for token use case: %w", err) + } + return authUseCase.NewTokenUseCaseWithMetrics(baseUseCase, businessMetrics), nil + } + + return baseUseCase, nil } // initAuditLogUseCase creates the audit log use case with all its dependencies. @@ -818,7 +932,18 @@ func (c *Container) initAuditLogUseCase() (authUseCase.AuditLogUseCase, error) { return nil, fmt.Errorf("failed to get audit log repository for audit log use case: %w", err) } - return authUseCase.NewAuditLogUseCase(auditLogRepository), nil + baseUseCase := authUseCase.NewAuditLogUseCase(auditLogRepository) + + // Wrap with metrics if enabled + if c.config.MetricsEnabled { + businessMetrics, err := c.BusinessMetrics() + if err != nil { + return nil, fmt.Errorf("failed to get business metrics for audit log use case: %w", err) + } + return authUseCase.NewAuditLogUseCaseWithMetrics(baseUseCase, businessMetrics), nil + } + + return baseUseCase, nil } // initClientHandler creates the client HTTP handler with all its dependencies. 
@@ -921,7 +1046,7 @@ func (c *Container) initSecretUseCase() (secretsUseCase.SecretUseCase, error) { aeadManager := c.AEADManager() keyManager := c.KeyManager() - return secretsUseCase.NewSecretUseCase( + baseUseCase := secretsUseCase.NewSecretUseCase( txManager, dekRepository, secretRepository, @@ -929,7 +1054,18 @@ func (c *Container) initSecretUseCase() (secretsUseCase.SecretUseCase, error) { aeadManager, keyManager, cryptoDomain.AESGCM, - ), nil + ) + + // Wrap with metrics if enabled + if c.config.MetricsEnabled { + businessMetrics, err := c.BusinessMetrics() + if err != nil { + return nil, fmt.Errorf("failed to get business metrics for secret use case: %w", err) + } + return secretsUseCase.NewSecretUseCaseWithMetrics(baseUseCase, businessMetrics), nil + } + + return baseUseCase, nil } // initSecretHandler creates the secret HTTP handler with all its dependencies. @@ -1119,14 +1255,25 @@ func (c *Container) initTransitKeyUseCase() (transitUseCase.TransitKeyUseCase, e keyManager := c.KeyManager() aeadManager := c.AEADManager() - return transitUseCase.NewTransitKeyUseCase( + baseUseCase := transitUseCase.NewTransitKeyUseCase( txManager, transitKeyRepository, dekRepository, keyManager, aeadManager, kekChain, - ), nil + ) + + // Wrap with metrics if enabled + if c.config.MetricsEnabled { + businessMetrics, err := c.BusinessMetrics() + if err != nil { + return nil, fmt.Errorf("failed to get business metrics for transit key use case: %w", err) + } + return transitUseCase.NewTransitKeyUseCaseWithMetrics(baseUseCase, businessMetrics), nil + } + + return baseUseCase, nil } // initTransitKeyHandler creates the transit key HTTP handler with all its dependencies. 
diff --git a/internal/auth/usecase/metrics_decorator.go b/internal/auth/usecase/metrics_decorator.go new file mode 100644 index 0000000..391e331 --- /dev/null +++ b/internal/auth/usecase/metrics_decorator.go @@ -0,0 +1,244 @@ +package usecase + +import ( + "context" + "time" + + "github.com/google/uuid" + + authDomain "github.com/allisson/secrets/internal/auth/domain" + "github.com/allisson/secrets/internal/metrics" +) + +// clientUseCaseWithMetrics decorates ClientUseCase with metrics instrumentation. +type clientUseCaseWithMetrics struct { + next ClientUseCase + metrics metrics.BusinessMetrics +} + +// NewClientUseCaseWithMetrics wraps a ClientUseCase with metrics recording. +func NewClientUseCaseWithMetrics(useCase ClientUseCase, m metrics.BusinessMetrics) ClientUseCase { + return &clientUseCaseWithMetrics{ + next: useCase, + metrics: m, + } +} + +// Create records metrics for client creation operations. +func (c *clientUseCaseWithMetrics) Create( + ctx context.Context, + createClientInput *authDomain.CreateClientInput, +) (*authDomain.CreateClientOutput, error) { + start := time.Now() + output, err := c.next.Create(ctx, createClientInput) + + status := "success" + if err != nil { + status = "error" + } + + c.metrics.RecordOperation(ctx, "auth", "client_create", status) + c.metrics.RecordDuration(ctx, "auth", "client_create", time.Since(start), status) + + return output, err +} + +// Update records metrics for client update operations. +func (c *clientUseCaseWithMetrics) Update( + ctx context.Context, + clientID uuid.UUID, + updateClientInput *authDomain.UpdateClientInput, +) error { + start := time.Now() + err := c.next.Update(ctx, clientID, updateClientInput) + + status := "success" + if err != nil { + status = "error" + } + + c.metrics.RecordOperation(ctx, "auth", "client_update", status) + c.metrics.RecordDuration(ctx, "auth", "client_update", time.Since(start), status) + + return err +} + +// Get records metrics for client retrieval operations. 
+func (c *clientUseCaseWithMetrics) Get(ctx context.Context, clientID uuid.UUID) (*authDomain.Client, error) { + start := time.Now() + client, err := c.next.Get(ctx, clientID) + + status := "success" + if err != nil { + status = "error" + } + + c.metrics.RecordOperation(ctx, "auth", "client_get", status) + c.metrics.RecordDuration(ctx, "auth", "client_get", time.Since(start), status) + + return client, err +} + +// List records metrics for client list operations. +func (c *clientUseCaseWithMetrics) List( + ctx context.Context, + offset, limit int, +) ([]*authDomain.Client, error) { + start := time.Now() + clients, err := c.next.List(ctx, offset, limit) + + status := "success" + if err != nil { + status = "error" + } + + c.metrics.RecordOperation(ctx, "auth", "client_list", status) + c.metrics.RecordDuration(ctx, "auth", "client_list", time.Since(start), status) + + return clients, err +} + +// Delete records metrics for client deletion operations. +func (c *clientUseCaseWithMetrics) Delete(ctx context.Context, clientID uuid.UUID) error { + start := time.Now() + err := c.next.Delete(ctx, clientID) + + status := "success" + if err != nil { + status = "error" + } + + c.metrics.RecordOperation(ctx, "auth", "client_delete", status) + c.metrics.RecordDuration(ctx, "auth", "client_delete", time.Since(start), status) + + return err +} + +// tokenUseCaseWithMetrics decorates TokenUseCase with metrics instrumentation. +type tokenUseCaseWithMetrics struct { + next TokenUseCase + metrics metrics.BusinessMetrics +} + +// NewTokenUseCaseWithMetrics wraps a TokenUseCase with metrics recording. +func NewTokenUseCaseWithMetrics(useCase TokenUseCase, m metrics.BusinessMetrics) TokenUseCase { + return &tokenUseCaseWithMetrics{ + next: useCase, + metrics: m, + } +} + +// Issue records metrics for token issuance operations. 
+func (t *tokenUseCaseWithMetrics) Issue( + ctx context.Context, + issueTokenInput *authDomain.IssueTokenInput, +) (*authDomain.IssueTokenOutput, error) { + start := time.Now() + output, err := t.next.Issue(ctx, issueTokenInput) + + status := "success" + if err != nil { + status = "error" + } + + t.metrics.RecordOperation(ctx, "auth", "token_issue", status) + t.metrics.RecordDuration(ctx, "auth", "token_issue", time.Since(start), status) + + return output, err +} + +// Authenticate records metrics for token authentication operations. +func (t *tokenUseCaseWithMetrics) Authenticate( + ctx context.Context, + tokenHash string, +) (*authDomain.Client, error) { + start := time.Now() + client, err := t.next.Authenticate(ctx, tokenHash) + + status := "success" + if err != nil { + status = "error" + } + + t.metrics.RecordOperation(ctx, "auth", "token_authenticate", status) + t.metrics.RecordDuration(ctx, "auth", "token_authenticate", time.Since(start), status) + + return client, err +} + +// auditLogUseCaseWithMetrics decorates AuditLogUseCase with metrics instrumentation. +type auditLogUseCaseWithMetrics struct { + next AuditLogUseCase + metrics metrics.BusinessMetrics +} + +// NewAuditLogUseCaseWithMetrics wraps an AuditLogUseCase with metrics recording. +func NewAuditLogUseCaseWithMetrics(useCase AuditLogUseCase, m metrics.BusinessMetrics) AuditLogUseCase { + return &auditLogUseCaseWithMetrics{ + next: useCase, + metrics: m, + } +} + +// Create records metrics for audit log creation operations. 
+func (a *auditLogUseCaseWithMetrics) Create( + ctx context.Context, + requestID uuid.UUID, + clientID uuid.UUID, + capability authDomain.Capability, + path string, + metadata map[string]any, +) error { + start := time.Now() + err := a.next.Create(ctx, requestID, clientID, capability, path, metadata) + + status := "success" + if err != nil { + status = "error" + } + + a.metrics.RecordOperation(ctx, "auth", "audit_log_create", status) + a.metrics.RecordDuration(ctx, "auth", "audit_log_create", time.Since(start), status) + + return err +} + +// List records metrics for audit log list operations. +func (a *auditLogUseCaseWithMetrics) List( + ctx context.Context, + offset, limit int, + createdAtFrom, createdAtTo *time.Time, +) ([]*authDomain.AuditLog, error) { + start := time.Now() + logs, err := a.next.List(ctx, offset, limit, createdAtFrom, createdAtTo) + + status := "success" + if err != nil { + status = "error" + } + + a.metrics.RecordOperation(ctx, "auth", "audit_log_list", status) + a.metrics.RecordDuration(ctx, "auth", "audit_log_list", time.Since(start), status) + + return logs, err +} + +// DeleteOlderThan records metrics for audit log deletion operations. 
+func (a *auditLogUseCaseWithMetrics) DeleteOlderThan( + ctx context.Context, + days int, + dryRun bool, +) (int64, error) { + start := time.Now() + count, err := a.next.DeleteOlderThan(ctx, days, dryRun) + + status := "success" + if err != nil { + status = "error" + } + + a.metrics.RecordOperation(ctx, "auth", "audit_log_delete", status) + a.metrics.RecordDuration(ctx, "auth", "audit_log_delete", time.Since(start), status) + + return count, err +} diff --git a/internal/config/config.go b/internal/config/config.go index 7ee63a6..d259a44 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -28,6 +28,10 @@ type Config struct { // Auth AuthTokenExpiration time.Duration + + // Metrics + MetricsEnabled bool + MetricsNamespace string } // Load loads configuration from environment variables and .env file. @@ -55,6 +59,10 @@ func Load() *Config { // Auth AuthTokenExpiration: env.GetDuration("AUTH_TOKEN_EXPIRATION_SECONDS", 86400, time.Second), + + // Metrics + MetricsEnabled: env.GetBool("METRICS_ENABLED", true), + MetricsNamespace: env.GetString("METRICS_NAMESPACE", "secrets"), } } diff --git a/internal/http/http_test.go b/internal/http/http_test.go index ea7f255..452989d 100644 --- a/internal/http/http_test.go +++ b/internal/http/http_test.go @@ -17,6 +17,8 @@ import ( "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/allisson/secrets/internal/metrics" ) // TestMain sets Gin to test mode for all tests in this package. @@ -246,44 +248,86 @@ func TestRequestIDMiddleware_HeaderPresent(t *testing.T) { _ = logger // Prevent unused variable error } -// TestRequestIDMiddleware_AccessibleInHandler verifies request ID can be retrieved in handlers. -func TestRequestIDMiddleware_AccessibleInHandler(t *testing.T) { - logger := slog.New(slog.NewTextHandler(io.Discard, nil)) +// TestRouter_MetricsEndpoint tests the /metrics endpoint when metrics are enabled. 
+func TestRouter_MetricsEndpoint(t *testing.T) { + server := createTestServer() - gin.SetMode(gin.TestMode) + // Create metrics provider + provider, err := metrics.NewProvider("test_app") + require.NoError(t, err) + defer func() { + assert.NoError(t, provider.Shutdown(context.Background())) + }() + + // Create router with metrics endpoint router := gin.New() + router.Use(gin.Recovery()) router.Use(requestid.New(requestid.WithGenerator(func() string { return uuid.Must(uuid.NewV7()).String() }))) + router.Use(CustomLoggerMiddleware(server.logger)) + + // Add metrics middleware + router.Use(metrics.HTTPMetricsMiddleware(provider.MeterProvider(), "test_app")) - var capturedRequestID string + // Add metrics endpoint + router.GET("/metrics", gin.WrapH(provider.Handler())) + + // Add a test endpoint to generate metrics router.GET("/test", func(c *gin.Context) { - // Capture request ID from context - capturedRequestID = requestid.Get(c) - c.JSON(http.StatusOK, gin.H{"request_id": capturedRequestID}) + c.JSON(http.StatusOK, gin.H{"message": "test"}) }) + // Generate some metrics by calling the test endpoint + for i := 0; i < 5; i++ { + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/test", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + } + + // Now request the metrics endpoint w := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodGet, "/test", nil) + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) router.ServeHTTP(w, req) assert.Equal(t, http.StatusOK, w.Code) - // Verify request ID from handler matches response header - headerRequestID := w.Header().Get("X-Request-Id") - assert.Equal(t, headerRequestID, capturedRequestID, - "Request ID from handler should match X-Request-Id header") + // Verify response is in Prometheus format (contains metric lines) + body := w.Body.String() + assert.NotEmpty(t, body, "metrics response should not be empty") + + // Check for expected metric names 
(OpenTelemetry automatically exposes these) + assert.Contains(t, body, "test_app_http_requests_total", "should contain HTTP requests counter metric") + assert.Contains( + t, + body, + "test_app_http_request_duration_seconds", + "should contain HTTP duration histogram metric", + ) + + // Verify Content-Type header + contentType := w.Header().Get("Content-Type") + assert.Contains(t, contentType, "text/plain", "metrics endpoint should return text/plain content type") +} - // Verify request ID is included in JSON response - var response map[string]string - err := json.Unmarshal(w.Body.Bytes(), &response) +// TestRouter_MetricsEndpoint_NoAuth tests that /metrics endpoint does not require authentication. +func TestRouter_MetricsEndpoint_NoAuth(t *testing.T) { + // Create metrics provider + provider, err := metrics.NewProvider("test_app") require.NoError(t, err) - assert.Equal(t, capturedRequestID, response["request_id"]) + defer func() { + assert.NoError(t, provider.Shutdown(context.Background())) + }() - // Verify it's a valid UUIDv7 - parsedUUID, err := uuid.Parse(capturedRequestID) - require.NoError(t, err, "Request ID should be a valid UUID") - assert.Equal(t, uuid.Version(7), parsedUUID.Version(), "Request ID should be UUIDv7") + // Create router with metrics endpoint (no auth middleware) + router := gin.New() + router.GET("/metrics", gin.WrapH(provider.Handler())) - _ = logger // Prevent unused variable error + // Request without authentication should succeed + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + router.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) } diff --git a/internal/http/server.go b/internal/http/server.go index 4e4f5ed..7408511 100644 --- a/internal/http/server.go +++ b/internal/http/server.go @@ -23,6 +23,7 @@ import ( authHTTP "github.com/allisson/secrets/internal/auth/http" authService "github.com/allisson/secrets/internal/auth/service" authUseCase 
"github.com/allisson/secrets/internal/auth/usecase" + "github.com/allisson/secrets/internal/metrics" secretsHTTP "github.com/allisson/secrets/internal/secrets/http" transitHTTP "github.com/allisson/secrets/internal/transit/http" ) @@ -63,6 +64,8 @@ func (s *Server) SetupRouter( tokenUseCase authUseCase.TokenUseCase, tokenService authService.TokenService, auditLogUseCase authUseCase.AuditLogUseCase, + metricsProvider *metrics.Provider, + metricsNamespace string, ) { // Create Gin engine without default middleware router := gin.New() @@ -74,6 +77,16 @@ func (s *Server) SetupRouter( }))) // Request ID with UUIDv7 router.Use(CustomLoggerMiddleware(s.logger)) // Custom slog logger + // Add HTTP metrics middleware if metrics are enabled + if metricsProvider != nil { + router.Use(metrics.HTTPMetricsMiddleware(metricsProvider.MeterProvider(), metricsNamespace)) + } + + // Metrics endpoint (Prometheus scrape endpoint, no authentication required) + if metricsProvider != nil { + router.GET("/metrics", gin.WrapH(metricsProvider.Handler())) + } + // Health and readiness endpoints (outside API versioning) router.GET("/health", s.healthHandler) router.GET("/ready", s.readinessHandler) diff --git a/internal/metrics/business.go b/internal/metrics/business.go new file mode 100644 index 0000000..b57dbd5 --- /dev/null +++ b/internal/metrics/business.go @@ -0,0 +1,113 @@ +package metrics + +import ( + "context" + "fmt" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// BusinessMetrics defines the interface for recording business operation metrics. +// Implementations track operation counts and durations for observability across +// different business domains (auth, secrets, transit). +type BusinessMetrics interface { + // RecordOperation records a business operation with its status. 
+ // Domain examples: "auth", "secrets", "transit" + // Operation examples: "client_create", "secret_get", "transit_encrypt" + // Status examples: "success", "error" + RecordOperation(ctx context.Context, domain, operation, status string) + + // RecordDuration records the duration of a business operation with its status. + // Duration is recorded in seconds as a histogram for percentile calculations. + RecordDuration(ctx context.Context, domain, operation string, duration time.Duration, status string) +} + +// businessMetrics implements BusinessMetrics using OpenTelemetry metrics. +type businessMetrics struct { + operationCounter metric.Int64Counter + durationHisto metric.Float64Histogram +} + +// NewBusinessMetrics creates a new BusinessMetrics implementation using the provided meter provider. +// The namespace parameter is used as a prefix for all metric names (e.g., "secrets"). +// Returns error if meters cannot be initialized. +func NewBusinessMetrics(meterProvider metric.MeterProvider, namespace string) (BusinessMetrics, error) { + meter := meterProvider.Meter(namespace) + + // Create counter for total operations + operationCounter, err := meter.Int64Counter( + fmt.Sprintf("%s_operations_total", namespace), + metric.WithDescription("Total number of business operations"), + metric.WithUnit("{operation}"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create operation counter: %w", err) + } + + // Create histogram for operation durations + durationHisto, err := meter.Float64Histogram( + fmt.Sprintf("%s_operation_duration_seconds", namespace), + metric.WithDescription("Duration of business operations in seconds"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create duration histogram: %w", err) + } + + return &businessMetrics{ + operationCounter: operationCounter, + durationHisto: durationHisto, + }, nil +} + +// RecordOperation increments the operation counter with domain, operation, and status labels. 
+func (b *businessMetrics) RecordOperation(ctx context.Context, domain, operation, status string) {
+	// The three labels identify which business operation ran and how it ended.
+	labels := metric.WithAttributes(
+		attribute.String("domain", domain),
+		attribute.String("operation", operation),
+		attribute.String("status", status),
+	)
+
+	b.operationCounter.Add(ctx, 1, labels)
+}
+
+// RecordDuration adds one observation to the duration histogram. The duration is converted
+// to seconds and tagged with the same domain, operation, and status labels as the counter.
+func (b *businessMetrics) RecordDuration(
+	ctx context.Context,
+	domain, operation string,
+	duration time.Duration,
+	status string,
+) {
+	labels := metric.WithAttributes(
+		attribute.String("domain", domain),
+		attribute.String("operation", operation),
+		attribute.String("status", status),
+	)
+
+	seconds := duration.Seconds()
+	b.durationHisto.Record(ctx, seconds, labels)
+}
+
+// NoOpBusinessMetrics is a BusinessMetrics implementation whose methods have empty bodies.
+// It is intended for use when metrics are disabled.
+type NoOpBusinessMetrics struct{}
+
+// NewNoOpBusinessMetrics returns a pointer to a NoOpBusinessMetrics as a BusinessMetrics.
+func NewNoOpBusinessMetrics() BusinessMetrics {
+	return new(NoOpBusinessMetrics)
+}
+
+// RecordOperation ignores its arguments and returns immediately.
+func (n *NoOpBusinessMetrics) RecordOperation(ctx context.Context, domain, operation, status string) {
+	// Intentionally empty.
+}
+
+// RecordDuration does nothing when metrics are disabled.
+func (n *NoOpBusinessMetrics) RecordDuration( + ctx context.Context, + domain, operation string, + duration time.Duration, + status string, +) { + // No-op +} diff --git a/internal/metrics/business_test.go b/internal/metrics/business_test.go new file mode 100644 index 0000000..d402bbb --- /dev/null +++ b/internal/metrics/business_test.go @@ -0,0 +1,128 @@ +package metrics + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewBusinessMetrics(t *testing.T) { + t.Run("Success_CreateBusinessMetrics", func(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + + businessMetrics, err := NewBusinessMetrics(provider.MeterProvider(), "test_app") + + require.NoError(t, err) + assert.NotNil(t, businessMetrics) + }) +} + +func TestBusinessMetrics_RecordOperation(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + + bm, err := NewBusinessMetrics(provider.MeterProvider(), "test_app") + require.NoError(t, err) + + t.Run("Success_RecordSuccessfulOperation", func(t *testing.T) { + // Should not panic + bm.RecordOperation(context.Background(), "auth", "create_client", "success") + }) + + t.Run("Success_RecordFailedOperation", func(t *testing.T) { + // Should not panic + bm.RecordOperation(context.Background(), "auth", "create_client", "error") + }) + + t.Run("Success_RecordMultipleDomains", func(t *testing.T) { + bm.RecordOperation(context.Background(), "auth", "create_client", "success") + bm.RecordOperation(context.Background(), "secrets", "encrypt", "success") + bm.RecordOperation(context.Background(), "transit", "rotate_key", "error") + }) +} + +func TestBusinessMetrics_RecordDuration(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + + bm, err := NewBusinessMetrics(provider.MeterProvider(), "test_app") + require.NoError(t, err) + + t.Run("Success_RecordSuccessfulDuration", func(t 
*testing.T) { + // Should not panic + bm.RecordDuration(context.Background(), "auth", "create_client", 123*time.Millisecond, "success") + }) + + t.Run("Success_RecordFailedDuration", func(t *testing.T) { + // Should not panic + bm.RecordDuration(context.Background(), "auth", "create_client", 456*time.Millisecond, "error") + }) + + t.Run("Success_RecordMultipleDomains", func(t *testing.T) { + bm.RecordDuration(context.Background(), "auth", "create_client", 100*time.Millisecond, "success") + bm.RecordDuration(context.Background(), "secrets", "encrypt", 200*time.Millisecond, "success") + bm.RecordDuration(context.Background(), "transit", "rotate_key", 300*time.Millisecond, "error") + }) +} + +func TestNewNoOpBusinessMetrics(t *testing.T) { + noOpMetrics := NewNoOpBusinessMetrics() + + assert.NotNil(t, noOpMetrics) + assert.IsType(t, &NoOpBusinessMetrics{}, noOpMetrics) + + t.Run("NoOp_RecordOperationDoesNotPanic", func(t *testing.T) { + // Should not panic or do anything + noOpMetrics.RecordOperation(context.Background(), "auth", "create_client", "success") + noOpMetrics.RecordOperation(context.Background(), "secrets", "encrypt", "error") + }) + + t.Run("NoOp_RecordDurationDoesNotPanic", func(t *testing.T) { + // Should not panic or do anything + noOpMetrics.RecordDuration( + context.Background(), + "auth", + "create_client", + 100*time.Millisecond, + "success", + ) + noOpMetrics.RecordDuration(context.Background(), "secrets", "encrypt", 200*time.Millisecond, "error") + }) +} + +func TestBusinessMetrics_Integration(t *testing.T) { + provider, err := NewProvider("integration_test") + require.NoError(t, err) + defer func() { + assert.NoError(t, provider.Shutdown(context.Background())) + }() + + bm, err := NewBusinessMetrics(provider.MeterProvider(), "integration_test") + require.NoError(t, err) + + // Record various operations + ctx := context.Background() + + // Record operation counts + bm.RecordOperation(ctx, "auth", "create_client", "success") + 
bm.RecordOperation(ctx, "auth", "create_client", "success") + bm.RecordOperation(ctx, "auth", "create_client", "error") + bm.RecordOperation(ctx, "secrets", "encrypt", "success") + bm.RecordOperation(ctx, "secrets", "decrypt", "success") + bm.RecordOperation(ctx, "transit", "rotate_key", "success") + + // Record operation durations + bm.RecordDuration(ctx, "auth", "create_client", 50*time.Millisecond, "success") + bm.RecordDuration(ctx, "auth", "create_client", 60*time.Millisecond, "success") + bm.RecordDuration(ctx, "auth", "create_client", 100*time.Millisecond, "error") + bm.RecordDuration(ctx, "secrets", "encrypt", 10*time.Millisecond, "success") + bm.RecordDuration(ctx, "secrets", "decrypt", 20*time.Millisecond, "success") + bm.RecordDuration(ctx, "transit", "rotate_key", 150*time.Millisecond, "success") + + // Metrics should be recorded without errors + // Actual metric values are tested through Prometheus scraping +} diff --git a/internal/metrics/http.go b/internal/metrics/http.go new file mode 100644 index 0000000..d4a14f4 --- /dev/null +++ b/internal/metrics/http.go @@ -0,0 +1,90 @@ +package metrics + +import ( + "fmt" + "strconv" + "time" + + "github.com/gin-gonic/gin" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// httpMetrics holds HTTP-specific metric instruments. +type httpMetrics struct { + requestCounter metric.Int64Counter + durationHisto metric.Float64Histogram +} + +// HTTPMetricsMiddleware returns a Gin middleware that records HTTP request metrics. +// Tracks total requests and request durations with method, path, and status_code labels. +// The path is sanitized to route patterns (e.g., /v1/secrets/:path) to prevent high cardinality. 
+func HTTPMetricsMiddleware(meterProvider metric.MeterProvider, namespace string) gin.HandlerFunc {
+	// passthrough is the single fallback handler returned when an instrument cannot be
+	// created: requests are still served, they are just not measured.
+	passthrough := func(c *gin.Context) {
+		c.Next()
+	}
+
+	meter := meterProvider.Meter(namespace)
+
+	// Counter for total HTTP requests.
+	requestCounter, err := meter.Int64Counter(
+		fmt.Sprintf("%s_http_requests_total", namespace),
+		metric.WithDescription("Total number of HTTP requests"),
+		metric.WithUnit("{request}"),
+	)
+	if err != nil {
+		return passthrough
+	}
+
+	// Histogram for HTTP request durations, recorded in seconds.
+	durationHisto, err := meter.Float64Histogram(
+		fmt.Sprintf("%s_http_request_duration_seconds", namespace),
+		metric.WithDescription("HTTP request duration in seconds"),
+		metric.WithUnit("s"),
+	)
+	if err != nil {
+		return passthrough
+	}
+
+	instruments := &httpMetrics{
+		requestCounter: requestCounter,
+		durationHisto:  durationHisto,
+	}
+
+	return func(c *gin.Context) {
+		start := time.Now()
+
+		// Run the rest of the handler chain first; the status code is only known afterwards.
+		c.Next()
+
+		duration := time.Since(start)
+		ctx := c.Request.Context()
+
+		// Build the label option once and share it between both instruments. The path label
+		// is the route pattern (c.FullPath), not the raw request URL.
+		labels := metric.WithAttributes(
+			attribute.String("method", c.Request.Method),
+			attribute.String("path", sanitizePath(c.FullPath())),
+			attribute.String("status_code", strconv.Itoa(c.Writer.Status())),
+		)
+
+		instruments.requestCounter.Add(ctx, 1, labels)
+		instruments.durationHisto.Record(ctx, duration.Seconds(), labels)
+	}
+}
+
+// sanitizePath maps a request's Gin route pattern to the value used for the "path" label.
+// A non-empty pattern is returned unchanged. An empty pattern (route not matched) becomes
+// "unknown", so every unmatched request shares one label value.
+func sanitizePath(fullPath string) string { + if fullPath == "" { + return "unknown" + } + return fullPath +} diff --git a/internal/metrics/http_test.go b/internal/metrics/http_test.go new file mode 100644 index 0000000..8edacbc --- /dev/null +++ b/internal/metrics/http_test.go @@ -0,0 +1,143 @@ +package metrics + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHTTPMetricsMiddleware(t *testing.T) { + gin.SetMode(gin.TestMode) + + t.Run("Success_RecordHTTPMetrics", func(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + defer func() { + assert.NoError(t, provider.Shutdown(context.Background())) + }() + + middleware := HTTPMetricsMiddleware(provider.MeterProvider(), "test_app") + + router := gin.New() + router.Use(middleware) + router.GET("/test", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"message": "ok"}) + }) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/test", nil) + router.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + }) + + t.Run("Success_RecordMultipleRequests", func(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + defer func() { + assert.NoError(t, provider.Shutdown(context.Background())) + }() + + middleware := HTTPMetricsMiddleware(provider.MeterProvider(), "test_app") + + router := gin.New() + router.Use(middleware) + router.GET("/test", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"message": "ok"}) + }) + router.POST("/test", func(c *gin.Context) { + c.JSON(http.StatusCreated, gin.H{"message": "created"}) + }) + router.GET("/error", func(c *gin.Context) { + c.JSON(http.StatusInternalServerError, gin.H{"error": "error"}) + }) + + // Record multiple requests + for i := 0; i < 5; i++ { + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/test", nil) + 
router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + } + + // Record POST request + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/test", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusCreated, w.Code) + + // Record error request + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodGet, "/error", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusInternalServerError, w.Code) + }) + + t.Run("Success_RecordWithPathParams", func(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + defer func() { + assert.NoError(t, provider.Shutdown(context.Background())) + }() + + middleware := HTTPMetricsMiddleware(provider.MeterProvider(), "test_app") + + router := gin.New() + router.Use(middleware) + router.GET("/users/:id", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"id": c.Param("id")}) + }) + + // Request with different path params should use route pattern + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/users/123", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodGet, "/users/456", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + }) +} + +func TestSanitizePath(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "RoutePattern", + input: "/v1/secrets/:path", + expected: "/v1/secrets/:path", + }, + { + name: "EmptyPath", + input: "", + expected: "unknown", + }, + { + name: "RootPath", + input: "/", + expected: "/", + }, + { + name: "WildcardPath", + input: "/v1/secrets/*path", + expected: "/v1/secrets/*path", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := sanitizePath(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/internal/metrics/provider.go b/internal/metrics/provider.go new file mode 
100644 index 0000000..f775745 --- /dev/null +++ b/internal/metrics/provider.go @@ -0,0 +1,72 @@ +// Package metrics provides OpenTelemetry metrics instrumentation with Prometheus export. +// Supports business operation metrics and HTTP request metrics for observability. +package metrics + +import ( + "context" + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + promexporter "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/sdk/metric" +) + +// Provider manages the OpenTelemetry meter provider and Prometheus exporter. +// Provides access to the HTTP handler for exposing metrics in Prometheus format. +type Provider struct { + meterProvider *metric.MeterProvider + exporter *promexporter.Exporter + registry *prometheus.Registry +} + +// NewProvider creates and initializes a new metrics provider with Prometheus exporter. +// The namespace parameter is used as a prefix for all metric names (e.g., "secrets"). +// Returns error if the Prometheus exporter cannot be initialized. +func NewProvider(namespace string) (*Provider, error) { + // Create custom Prometheus registry + registry := prometheus.NewRegistry() + + // Create Prometheus exporter with custom registry + exporter, err := promexporter.New( + promexporter.WithRegisterer(registry), + ) + if err != nil { + return nil, fmt.Errorf("failed to create prometheus exporter: %w", err) + } + + // Create meter provider with Prometheus exporter + meterProvider := metric.NewMeterProvider( + metric.WithReader(exporter), + ) + + return &Provider{ + meterProvider: meterProvider, + exporter: exporter, + registry: registry, + }, nil +} + +// Handler returns an HTTP handler that serves metrics in Prometheus exposition format. +// This handler should be exposed at the /metrics endpoint for Prometheus scraping. 
+func (p *Provider) Handler() http.Handler { + return promhttp.HandlerFor(p.registry, promhttp.HandlerOpts{ + EnableOpenMetrics: true, + }) +} + +// MeterProvider returns the OpenTelemetry meter provider for creating meters. +// Use this to obtain a meter for recording metrics in different parts of the application. +func (p *Provider) MeterProvider() *metric.MeterProvider { + return p.meterProvider +} + +// Shutdown performs cleanup of the metrics provider and flushes any pending metrics. +// Should be called during application shutdown to ensure all metrics are exported. +func (p *Provider) Shutdown(ctx context.Context) error { + if p.meterProvider == nil { + return nil + } + return p.meterProvider.Shutdown(ctx) +} diff --git a/internal/metrics/provider_test.go b/internal/metrics/provider_test.go new file mode 100644 index 0000000..09eb110 --- /dev/null +++ b/internal/metrics/provider_test.go @@ -0,0 +1,61 @@ +package metrics + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewProvider(t *testing.T) { + t.Run("Success_CreateProviderWithNamespace", func(t *testing.T) { + provider, err := NewProvider("test_app") + + require.NoError(t, err) + assert.NotNil(t, provider) + assert.NotNil(t, provider.meterProvider) + assert.NotNil(t, provider.exporter) + assert.NotNil(t, provider.registry) + }) + + t.Run("Success_CreateProviderWithEmptyNamespace", func(t *testing.T) { + provider, err := NewProvider("") + + require.NoError(t, err) + assert.NotNil(t, provider) + }) +} + +func TestProvider_MeterProvider(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + + meterProvider := provider.MeterProvider() + assert.NotNil(t, meterProvider) +} + +func TestProvider_Handler(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + + handler := provider.Handler() + assert.NotNil(t, handler) +} + +func TestProvider_Shutdown(t *testing.T) { + 
t.Run("Success_ShutdownProvider", func(t *testing.T) { + provider, err := NewProvider("test_app") + require.NoError(t, err) + + err = provider.Shutdown(context.Background()) + assert.NoError(t, err) + }) + + t.Run("Success_ShutdownNilProvider", func(t *testing.T) { + provider := &Provider{meterProvider: nil} + + err := provider.Shutdown(context.Background()) + assert.NoError(t, err) + }) +} diff --git a/internal/secrets/usecase/metrics_decorator.go b/internal/secrets/usecase/metrics_decorator.go new file mode 100644 index 0000000..193f59e --- /dev/null +++ b/internal/secrets/usecase/metrics_decorator.go @@ -0,0 +1,95 @@ +package usecase + +import ( + "context" + "time" + + "github.com/allisson/secrets/internal/metrics" + secretsDomain "github.com/allisson/secrets/internal/secrets/domain" +) + +// secretUseCaseWithMetrics decorates SecretUseCase with metrics instrumentation. +type secretUseCaseWithMetrics struct { + next SecretUseCase + metrics metrics.BusinessMetrics +} + +// NewSecretUseCaseWithMetrics wraps a SecretUseCase with metrics recording. +func NewSecretUseCaseWithMetrics(useCase SecretUseCase, m metrics.BusinessMetrics) SecretUseCase { + return &secretUseCaseWithMetrics{ + next: useCase, + metrics: m, + } +} + +// CreateOrUpdate records metrics for secret creation/update operations. +func (s *secretUseCaseWithMetrics) CreateOrUpdate( + ctx context.Context, + path string, + value []byte, +) (*secretsDomain.Secret, error) { + start := time.Now() + secret, err := s.next.CreateOrUpdate(ctx, path, value) + + status := "success" + if err != nil { + status = "error" + } + + s.metrics.RecordOperation(ctx, "secrets", "secret_create", status) + s.metrics.RecordDuration(ctx, "secrets", "secret_create", time.Since(start), status) + + return secret, err +} + +// Get records metrics for secret retrieval operations. 
+func (s *secretUseCaseWithMetrics) Get(ctx context.Context, path string) (*secretsDomain.Secret, error) {
+	const (
+		domain    = "secrets"
+		operation = "secret_get"
+	)
+
+	// Take the timestamp before delegating so the recorded duration includes the wrapped call.
+	begin := time.Now()
+	found, err := s.next.Get(ctx, path)
+
+	// A non-nil error from the wrapped use case is labelled "error"; anything else is "success".
+	outcome := "error"
+	if err == nil {
+		outcome = "success"
+	}
+
+	s.metrics.RecordOperation(ctx, domain, operation, outcome)
+	s.metrics.RecordDuration(ctx, domain, operation, time.Since(begin), outcome)
+
+	// The wrapped call's results are returned untouched.
+	return found, err
+}
+
+// GetByVersion forwards to the wrapped use case, then records one operation count and one
+// duration sample under "secrets"/"secret_get_version" with the call's success or error status.
+func (s *secretUseCaseWithMetrics) GetByVersion(
+	ctx context.Context,
+	path string,
+	version uint,
+) (*secretsDomain.Secret, error) {
+	const (
+		domain    = "secrets"
+		operation = "secret_get_version"
+	)
+
+	begin := time.Now()
+	found, err := s.next.GetByVersion(ctx, path, version)
+
+	outcome := "error"
+	if err == nil {
+		outcome = "success"
+	}
+
+	s.metrics.RecordOperation(ctx, domain, operation, outcome)
+	s.metrics.RecordDuration(ctx, domain, operation, time.Since(begin), outcome)
+
+	return found, err
+}
+
+// Delete records metrics for secret deletion operations.
+func (s *secretUseCaseWithMetrics) Delete(ctx context.Context, path string) error { + start := time.Now() + err := s.next.Delete(ctx, path) + + status := "success" + if err != nil { + status = "error" + } + + s.metrics.RecordOperation(ctx, "secrets", "secret_delete", status) + s.metrics.RecordDuration(ctx, "secrets", "secret_delete", time.Since(start), status) + + return err +} diff --git a/internal/transit/usecase/metrics_decorator.go b/internal/transit/usecase/metrics_decorator.go new file mode 100644 index 0000000..6452a70 --- /dev/null +++ b/internal/transit/usecase/metrics_decorator.go @@ -0,0 +1,122 @@ +package usecase + +import ( + "context" + "time" + + "github.com/google/uuid" + + cryptoDomain "github.com/allisson/secrets/internal/crypto/domain" + "github.com/allisson/secrets/internal/metrics" + transitDomain "github.com/allisson/secrets/internal/transit/domain" +) + +// transitKeyUseCaseWithMetrics decorates TransitKeyUseCase with metrics instrumentation. +type transitKeyUseCaseWithMetrics struct { + next TransitKeyUseCase + metrics metrics.BusinessMetrics +} + +// NewTransitKeyUseCaseWithMetrics wraps a TransitKeyUseCase with metrics recording. +func NewTransitKeyUseCaseWithMetrics(useCase TransitKeyUseCase, m metrics.BusinessMetrics) TransitKeyUseCase { + return &transitKeyUseCaseWithMetrics{ + next: useCase, + metrics: m, + } +} + +// Create records metrics for transit key creation operations. +func (t *transitKeyUseCaseWithMetrics) Create( + ctx context.Context, + name string, + alg cryptoDomain.Algorithm, +) (*transitDomain.TransitKey, error) { + start := time.Now() + key, err := t.next.Create(ctx, name, alg) + + status := "success" + if err != nil { + status = "error" + } + + t.metrics.RecordOperation(ctx, "transit", "transit_key_create", status) + t.metrics.RecordDuration(ctx, "transit", "transit_key_create", time.Since(start), status) + + return key, err +} + +// Rotate records metrics for transit key rotation operations. 
+func (t *transitKeyUseCaseWithMetrics) Rotate(
+	ctx context.Context,
+	name string,
+	alg cryptoDomain.Algorithm,
+) (*transitDomain.TransitKey, error) {
+	const (
+		domain    = "transit"
+		operation = "transit_key_rotate"
+	)
+
+	// Take the timestamp before delegating so the recorded duration includes the wrapped call.
+	begin := time.Now()
+	rotated, err := t.next.Rotate(ctx, name, alg)
+
+	// A non-nil error from the wrapped use case is labelled "error"; anything else is "success".
+	outcome := "error"
+	if err == nil {
+		outcome = "success"
+	}
+
+	t.metrics.RecordOperation(ctx, domain, operation, outcome)
+	t.metrics.RecordDuration(ctx, domain, operation, time.Since(begin), outcome)
+
+	// The wrapped call's results are returned untouched.
+	return rotated, err
+}
+
+// Delete forwards to the wrapped use case, then records one operation count and one duration
+// sample under "transit"/"transit_key_delete" with the call's success or error status.
+func (t *transitKeyUseCaseWithMetrics) Delete(ctx context.Context, transitKeyID uuid.UUID) error {
+	const (
+		domain    = "transit"
+		operation = "transit_key_delete"
+	)
+
+	begin := time.Now()
+	err := t.next.Delete(ctx, transitKeyID)
+
+	outcome := "error"
+	if err == nil {
+		outcome = "success"
+	}
+
+	t.metrics.RecordOperation(ctx, domain, operation, outcome)
+	t.metrics.RecordDuration(ctx, domain, operation, time.Since(begin), outcome)
+
+	return err
+}
+
+// Encrypt forwards to the wrapped use case, then records one operation count and one duration
+// sample under "transit"/"transit_encrypt" with the call's success or error status.
+func (t *transitKeyUseCaseWithMetrics) Encrypt(
+	ctx context.Context,
+	name string,
+	plaintext []byte,
+) (*transitDomain.EncryptedBlob, error) {
+	const (
+		domain    = "transit"
+		operation = "transit_encrypt"
+	)
+
+	begin := time.Now()
+	sealed, err := t.next.Encrypt(ctx, name, plaintext)
+
+	outcome := "error"
+	if err == nil {
+		outcome = "success"
+	}
+
+	t.metrics.RecordOperation(ctx, domain, operation, outcome)
+	t.metrics.RecordDuration(ctx, domain, operation, time.Since(begin), outcome)
+
+	return sealed, err
+}
+
+// Decrypt records metrics for transit decryption operations.
+func (t *transitKeyUseCaseWithMetrics) Decrypt(
+	ctx context.Context,
+	name string,
+	ciphertext string,
+) (*transitDomain.EncryptedBlob, error) {
+	const (
+		domain    = "transit"
+		operation = "transit_decrypt"
+	)
+
+	// Take the timestamp before delegating so the recorded duration includes the wrapped call.
+	begin := time.Now()
+	opened, err := t.next.Decrypt(ctx, name, ciphertext)
+
+	// A non-nil error from the wrapped use case is labelled "error"; anything else is "success".
+	outcome := "error"
+	if err == nil {
+		outcome = "success"
+	}
+
+	t.metrics.RecordOperation(ctx, domain, operation, outcome)
+	t.metrics.RecordDuration(ctx, domain, operation, time.Since(begin), outcome)
+
+	// The wrapped call's results are returned untouched.
+	return opened, err
+}