From 5779a876e1d167f1f8bc9b6bbe209d11d561e864 Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 23 Feb 2026 02:26:22 -0500 Subject: [PATCH 1/5] docs: add monitoring overview and metrics reference guides Signed-off-by: urismiley --- .../preview/monitoring/metrics.md | 311 ++++++++++++++++++ .../preview/monitoring/overview.md | 285 ++++++++++++++++ mkdocs.yml | 3 + 3 files changed, 599 insertions(+) create mode 100644 docs/operator-public-documentation/preview/monitoring/metrics.md create mode 100644 docs/operator-public-documentation/preview/monitoring/overview.md diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md new file mode 100644 index 00000000..ae464f24 --- /dev/null +++ b/docs/operator-public-documentation/preview/monitoring/metrics.md @@ -0,0 +1,311 @@ +# Metrics Reference + +This page documents the key metrics available when monitoring a DocumentDB cluster, organized by source. Each section includes the metric name, description, labels, and example PromQL queries. + +## Container Resource Metrics + +These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). They cover CPU, memory, network, and filesystem for the **postgres** and **gateway** containers in each DocumentDB pod. 
+ +### CPU + +| Metric | Type | Description | +|--------|------|-------------| +| `container_cpu_usage_seconds_total` | Counter | Cumulative CPU time consumed in seconds | +| `container_spec_cpu_quota` | Gauge | CPU quota (microseconds per `cpu_period`) | +| `container_spec_cpu_period` | Gauge | CPU CFS scheduling period (microseconds) | + +**Common labels:** `namespace`, `pod`, `container`, `node` + +#### Example Queries + +CPU usage rate per container over 5 minutes: + +```promql +rate(container_cpu_usage_seconds_total{ + container=~"postgres|gateway", + pod=~".*documentdb.*" +}[5m]) +``` + +CPU utilization as a percentage of limit: + +```promql +(rate(container_cpu_usage_seconds_total{ + container="postgres", + pod=~".*documentdb.*" +}[5m]) +/ on(pod, container) +(container_spec_cpu_quota{ + container="postgres", + pod=~".*documentdb.*" +} / 1e5)) * 100 +``` + +Compare gateway vs. postgres CPU across all pods: + +```promql +sum by (container) ( + rate(container_cpu_usage_seconds_total{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + }[5m]) +) +``` + +### Memory + +| Metric | Type | Description | +|--------|------|-------------| +| `container_memory_working_set_bytes` | Gauge | Current working set memory (bytes) | +| `container_memory_rss` | Gauge | Resident set size (bytes) | +| `container_memory_cache` | Gauge | Page cache memory (bytes) | +| `container_spec_memory_limit_bytes` | Gauge | Memory limit (bytes) | + +**Common labels:** `namespace`, `pod`, `container`, `node` + +#### Example Queries + +Memory usage in MiB per container: + +```promql +container_memory_working_set_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" +} / 1024 / 1024 +``` + +Memory utilization as a percentage of limit: + +```promql +(container_memory_working_set_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" +} +/ container_spec_memory_limit_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" +}) * 100 +``` + +Top 5 pods by memory 
usage: + +```promql +topk(5, + sum by (pod) ( + container_memory_working_set_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + } + ) +) +``` + +### Network + +| Metric | Type | Description | +|--------|------|-------------| +| `container_network_receive_bytes_total` | Counter | Bytes received | +| `container_network_transmit_bytes_total` | Counter | Bytes transmitted | + +**Common labels:** `namespace`, `pod`, `interface` + +#### Example Queries + +Network throughput (bytes/sec) per pod: + +```promql +sum by (pod) ( + rate(container_network_receive_bytes_total{ + pod=~".*documentdb.*" + }[5m]) + + rate(container_network_transmit_bytes_total{ + pod=~".*documentdb.*" + }[5m]) +) +``` + +### Filesystem + +| Metric | Type | Description | +|--------|------|-------------| +| `container_fs_usage_bytes` | Gauge | Filesystem usage (bytes) | +| `container_fs_reads_bytes_total` | Counter | Filesystem read bytes | +| `container_fs_writes_bytes_total` | Counter | Filesystem write bytes | + +**Common labels:** `namespace`, `pod`, `container`, `device` + +#### Example Queries + +Disk I/O rate for the postgres container: + +```promql +rate(container_fs_writes_bytes_total{ + container="postgres", + pod=~".*documentdb.*" +}[5m]) +``` + +## Operator Metrics (controller-runtime) + +The DocumentDB operator binary exposes standard controller-runtime metrics on its metrics endpoint. These track reconciliation performance and work queue health. 
+ +### Reconciliation + +| Metric | Type | Description | +|--------|------|-------------| +| `controller_runtime_reconcile_total` | Counter | Total reconciliations | +| `controller_runtime_reconcile_errors_total` | Counter | Total reconciliation errors | +| `controller_runtime_reconcile_time_seconds` | Histogram | Time spent in reconciliation | + +**Common labels:** `controller` (e.g., `documentdb`, `backup`, `scheduledbackup`, `certificate`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`) + +#### Example Queries + +Reconciliation error rate by controller: + +```promql +sum by (controller) ( + rate(controller_runtime_reconcile_errors_total[5m]) +) +``` + +P95 reconciliation latency for the DocumentDB controller: + +```promql +histogram_quantile(0.95, + sum by (le) ( + rate(controller_runtime_reconcile_time_seconds_bucket{ + controller="documentdb" + }[5m]) + ) +) +``` + +Reconciliation throughput (reconciles/sec): + +```promql +sum by (controller) ( + rate(controller_runtime_reconcile_total[5m]) +) +``` + +### Work Queue + +| Metric | Type | Description | +|--------|------|-------------| +| `workqueue_depth` | Gauge | Current number of items in the queue | +| `workqueue_adds_total` | Counter | Total items added | +| `workqueue_queue_duration_seconds` | Histogram | Time items spend in queue before processing | +| `workqueue_work_duration_seconds` | Histogram | Time spent processing items | +| `workqueue_retries_total` | Counter | Total retries | + +**Common labels:** `name` (queue name, maps to controller name) + +#### Example Queries + +Work queue depth by controller: + +```promql +workqueue_depth{name=~"documentdb|backup|scheduledbackup|certificate"} +``` + +Average time items spend waiting in queue: + +```promql +rate(workqueue_queue_duration_seconds_sum{name="documentdb"}[5m]) +/ rate(workqueue_queue_duration_seconds_count{name="documentdb"}[5m]) +``` + +## CNPG / PostgreSQL Metrics + +CloudNative-PG exposes PostgreSQL-level 
metrics from each managed pod. These are available when CNPG monitoring is enabled. For the full list, see the [CloudNative-PG monitoring docs](https://cloudnative-pg.io/documentation/current/monitoring/). + +### Replication + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_pg_replication_lag` | Gauge | Replication lag in seconds | +| `cnpg_pg_replication_streaming_replicas` | Gauge | Number of streaming replicas | + +#### Example Queries + +Replication lag per pod: + +```promql +cnpg_pg_replication_lag{pod=~".*documentdb.*"} +``` + +### Connections + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_pg_stat_activity_count` | Gauge | Active backend connections by state | + +#### Example Queries + +Active connections by state: + +```promql +sum by (state) ( + cnpg_pg_stat_activity_count{pod=~".*documentdb.*"} +) +``` + +### Storage + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_pg_database_size_bytes` | Gauge | Total database size | +| `cnpg_pg_stat_bgwriter_buffers_checkpoint` | Counter | Buffers written during checkpoints | + +#### Example Queries + +Database size in GiB: + +```promql +cnpg_pg_database_size_bytes{pod=~".*documentdb.*"} / 1024 / 1024 / 1024 +``` + +### Cluster Health + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_collector_up` | Gauge | 1 if the CNPG metrics collector is running | +| `cnpg_pg_postmaster_start_time` | Gauge | PostgreSQL start timestamp | + +#### Example Queries + +Detect pods where the metrics collector is down: + +```promql +cnpg_collector_up{pod=~".*documentdb.*"} == 0 +``` + +## Gateway Metrics (Future) + +The DocumentDB Gateway does not currently expose application-level metrics. 
When implemented, expect metrics like: + +| Metric | Type | Description | +|--------|------|-------------| +| `documentdb_gateway_requests_total` | Counter | Total API requests (labels: `method`, `status`) | +| `documentdb_gateway_request_duration_seconds` | Histogram | Request latency | +| `documentdb_gateway_active_connections` | Gauge | Current connection count | +| `documentdb_gateway_read_operations_total` | Counter | Read operations (labels: `database`, `collection`) | +| `documentdb_gateway_write_operations_total` | Counter | Write operations (labels: `database`, `collection`) | +| `documentdb_gateway_errors_total` | Counter | Error count (labels: `error_type`, `operation`) | + +These will be collected via Prometheus scraping (`/metrics` endpoint) or OTLP push. See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the planned implementation. + +## OpenTelemetry Metric Names + +When using the OpenTelemetry `kubeletstats` and `k8s_cluster` receivers, metric names use the OpenTelemetry naming convention instead of Prometheus-style names: + +| OpenTelemetry Name | Prometheus Equivalent | +|---|---| +| `container.cpu.time` | `container_cpu_usage_seconds_total` | +| `container.memory.working_set` | `container_memory_working_set_bytes` | +| `k8s.container.cpu_limit` | `container_spec_cpu_quota` | +| `k8s.container.memory_limit` | `container_spec_memory_limit_bytes` | +| `k8s.pod.network.io` | `container_network_*_bytes_total` | + +When writing queries, use the naming convention matching your collection method. The telemetry playground uses the OpenTelemetry names; a direct Prometheus scrape of cAdvisor uses Prometheus names. 
diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md new file mode 100644 index 00000000..7e1422c8 --- /dev/null +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -0,0 +1,285 @@ +# Monitoring Overview + +This guide describes how to monitor DocumentDB clusters running on Kubernetes using OpenTelemetry, Prometheus, and Grafana. + +## Prerequisites + +- A running Kubernetes cluster with the DocumentDB operator installed +- [Helm 3](https://helm.sh/docs/intro/install/) for deploying Prometheus and Grafana +- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured for your cluster +- (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments + +## Architecture + +A DocumentDB pod contains two containers: + +- **PostgreSQL container** — the DocumentDB engine (PostgreSQL with DocumentDB extensions) +- **Gateway container** — MongoDB-compatible API sidecar + +The recommended monitoring stack collects infrastructure metrics from these containers and stores them for visualization and alerting. 
+ +``` +┌──────────────────────────────────────────────────────┐ +│ Grafana │ +│ (dashboards & alerts) │ +└──────────────────┬───────────────────────────────────┘ + │ +┌──────────────────┴───────────────────────────────────┐ +│ Prometheus │ +│ (metrics storage) │ +└──────────────────┬───────────────────────────────────┘ + │ remote write +┌──────────────────┴───────────────────────────────────┐ +│ OpenTelemetry Collector (DaemonSet) │ +│ Receivers: kubeletstats, k8s_cluster, prometheus │ +│ Processors: resource detection, attribute enrichment │ +│ Exporters: prometheusremotewrite │ +└──────────────────┬───────────────────────────────────┘ + │ scrape +┌──────────────────┴───────────────────────────────────┐ +│ DocumentDB Pods │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ PostgreSQL │ │ Gateway │ │ +│ │ container │ │ container │ │ +│ └──────────────┘ └──────────────┘ │ +└──────────────────────────────────────────────────────┘ +``` + +### Why DaemonSet over sidecar + +The OpenTelemetry Collector runs as a **DaemonSet** (one collector per node) rather than as a sidecar per pod. This provides: + +- Lower resource overhead — one collector per node instead of one per pod +- Node-level metrics visibility (CPU, memory, filesystem) +- Simpler configuration and management + +For multi-tenant setups requiring per-namespace isolation, a **Deployment** per namespace is used instead. See the [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) for both patterns. + +## Prometheus Integration + +### Operator Metrics + +The DocumentDB operator exposes a metrics endpoint via controller-runtime. 
By default: + +- **Bind address**: controlled by `--metrics-bind-address` (default `0`, disabled) +- **Secure mode**: `--metrics-secure=true` serves via HTTPS with authn/authz +- **Certificates**: supply `--metrics-cert-path` for custom TLS, otherwise self-signed certs are generated + +To enable metrics scraping, set the bind address in the operator deployment (for example, `:8443` for HTTPS or `:8080` for HTTP). + +### CNPG Cluster Metrics + +The underlying CloudNative-PG cluster exposes PostgreSQL metrics on each pod. These are collected by the OpenTelemetry Collector's Prometheus receiver via Kubernetes service discovery. Key metric sources: + +| Source | Method | Metrics | +|--------|--------|---------| +| kubelet/cAdvisor | `kubeletstats` receiver | Container CPU, memory, network, filesystem | +| Kubernetes API | `k8s_cluster` receiver | Pod status, restart counts, resource requests/limits | +| Application endpoints | `prometheus` receiver | Custom application metrics (when available) | + +### ServiceMonitor / PodMonitor + +If you use the Prometheus Operator, create a `ServiceMonitor` targeting the operator's metrics service: + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: documentdb-operator + namespace: documentdb-operator +spec: + selector: + matchLabels: + app: documentdb-operator + endpoints: + - port: metrics + scheme: https + tlsConfig: + insecureSkipVerify: true +``` + +## Key Metrics + +### Container Resource Metrics + +| Metric | Description | Container | +|--------|-------------|-----------| +| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, gateway | +| `container_memory_working_set_bytes` | Current memory usage | postgres, gateway | +| `container_spec_memory_limit_bytes` | Memory limit | postgres, gateway | +| `container_network_receive_bytes_total` | Network bytes received | pod-level | +| `container_fs_reads_bytes_total` | Filesystem read bytes | postgres | + +### 
Controller-Runtime Metrics + +| Metric | Description | +|--------|-------------| +| `controller_runtime_reconcile_total` | Total reconciliations by controller and result | +| `controller_runtime_reconcile_errors_total` | Total reconciliation errors | +| `controller_runtime_reconcile_time_seconds` | Reconciliation duration histogram | +| `workqueue_depth` | Current depth of the work queue | +| `workqueue_adds_total` | Total items added to the work queue | + +### CNPG / PostgreSQL Metrics + +When CNPG monitoring is enabled, additional PostgreSQL-level metrics are available: + +| Metric | Description | +|--------|-------------| +| `cnpg_collector_up` | Whether the CNPG metrics collector is running | +| `cnpg_pg_replication_lag` | Replication lag in seconds | +| `cnpg_pg_stat_activity_count` | Number of backend connections, by state | +| `cnpg_pg_database_size_bytes` | Database size | + +For the full CNPG metrics reference, see the [CloudNative-PG monitoring documentation](https://cloudnative-pg.io/documentation/current/monitoring/). 
+ +## Alerts + +### Recommended Alert Rules + +```yaml +groups: + - name: documentdb.alerts + rules: + - alert: DocumentDBHighCPU + expr: | + (rate(container_cpu_usage_seconds_total{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + }[5m]) + / on(pod, container) container_spec_cpu_quota{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + } * 1e5) > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU on {{ $labels.pod }}/{{ $labels.container }}" + + - alert: DocumentDBHighMemory + expr: | + (container_memory_working_set_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + } + / container_spec_memory_limit_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + }) > 0.85 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory on {{ $labels.pod }}/{{ $labels.container }}" + + - alert: DocumentDBPodRestarting + expr: | + increase(kube_pod_container_status_restarts_total{ + pod=~".*documentdb.*" + }[1h]) > 3 + labels: + severity: critical + annotations: + summary: "{{ $labels.pod }} restarted {{ $value }} times in the last hour" + + - alert: DocumentDBReconcileErrors + expr: | + rate(controller_runtime_reconcile_errors_total{ + controller="documentdb" + }[5m]) > 0 + for: 10m + labels: + severity: warning + annotations: + summary: "DocumentDB controller has reconciliation errors" +``` + +### Recording Rules + +Pre-compute common queries with recording rules for dashboard efficiency: + +```yaml +groups: + - name: documentdb.rules + rules: + - record: documentdb:cpu_usage_rate5m + expr: | + rate(container_cpu_usage_seconds_total{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + }[5m]) + + - record: documentdb:memory_usage_bytes + expr: | + container_memory_working_set_bytes{ + container=~"postgres|gateway", + pod=~".*documentdb.*" + } + + - record: documentdb:memory_utilization_percent + expr: | + (documentdb:memory_usage_bytes + / container_spec_memory_limit_bytes{ + 
container=~"postgres|gateway", + pod=~".*documentdb.*" + }) * 100 +``` + +## Telemetry Playground + +The [`documentdb-playground/telemetry/`](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with: + +- Multi-tenant namespace isolation (separate Prometheus + Grafana per team) +- OpenTelemetry Collector configurations for cAdvisor metric scraping +- Automated Grafana dashboard provisioning scripts +- AKS cluster setup with the OpenTelemetry Operator + +Run the quickstart: + +```bash +cd documentdb-playground/telemetry/scripts/ + +# One-time infrastructure setup +./create-cluster.sh --install-all + +# Deploy multi-tenant DocumentDB + monitoring +./deploy-multi-tenant-telemetry.sh + +# Create Grafana dashboards +./setup-grafana-dashboards.sh sales-namespace + +# Access Grafana +kubectl port-forward -n sales-namespace svc/grafana-sales 3001:3000 & +# Open http://localhost:3001 (admin / admin123) +``` + +See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics. 
+ +## Verification + +After deploying the monitoring stack, confirm that metrics are flowing: + +```bash +# Check that the OpenTelemetry Collector pods are running +kubectl get pods -l app.kubernetes.io/name=opentelemetry-collector + +# Verify Prometheus is receiving metrics (port-forward first) +kubectl port-forward svc/prometheus-server 9090:80 & +curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result | length' + +# Confirm DocumentDB container metrics are present (--data-urlencode keeps curl from globbing the {...} selector) +curl -sG 'http://localhost:9090/api/v1/query' --data-urlencode 'query=container_cpu_usage_seconds_total{pod=~".*documentdb.*"}' \ + | jq '.data.result | length' +``` + +If no metrics appear, check: + +- The collector's service account has RBAC access to the kubelet metrics API +- Namespace label filters in the collector config match your DocumentDB namespace +- The Prometheus remote-write endpoint is reachable from the collector + +## Next Steps + +- [Metrics Reference](metrics.md) — detailed metric descriptions and PromQL query examples +- [CloudNative-PG Monitoring](https://cloudnative-pg.io/documentation/current/monitoring/) — upstream PostgreSQL metrics diff --git a/mkdocs.yml b/mkdocs.yml index efcb23f1..6dcd7e3f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,9 @@ nav: - Get Started: preview/index.md - Advanced Configuration: preview/advanced-configuration/README.md - Backup and Restore: preview/backup-and-restore.md + - Monitoring: + - Overview: preview/monitoring/overview.md + - Metrics Reference: preview/monitoring/metrics.md - FAQ: preview/faq.md - Tools: - Kubectl Plugin: preview/kubectl-plugin.md From 086e5e9a590409494330bb16e6d2cd2649aa71c8 Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 23 Feb 2026 03:04:51 -0500 Subject: [PATCH 2/5] fix: correct container and controller names in monitoring docs Signed-off-by: urismiley --- .../preview/monitoring/metrics.md | 24 +++++++-------- .../preview/monitoring/overview.md | 30 +++++++++---------- 2 files changed, 27 insertions(+), 27 
deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md index ae464f24..493c0c61 100644 --- a/docs/operator-public-documentation/preview/monitoring/metrics.md +++ b/docs/operator-public-documentation/preview/monitoring/metrics.md @@ -4,7 +4,7 @@ This page documents the key metrics available when monitoring a DocumentDB clust ## Container Resource Metrics -These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). They cover CPU, memory, network, and filesystem for the **postgres** and **gateway** containers in each DocumentDB pod. +These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). They cover CPU, memory, network, and filesystem for the **postgres** and **documentdb-gateway** containers in each DocumentDB pod. ### CPU @@ -22,7 +22,7 @@ CPU usage rate per container over 5 minutes: ```promql rate(container_cpu_usage_seconds_total{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }[5m]) ``` @@ -46,7 +46,7 @@ Compare gateway vs. 
postgres CPU across all pods: ```promql sum by (container) ( rate(container_cpu_usage_seconds_total{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }[5m]) ) @@ -69,7 +69,7 @@ Memory usage in MiB per container: ```promql container_memory_working_set_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" } / 1024 / 1024 ``` @@ -78,11 +78,11 @@ Memory utilization as a percentage of limit: ```promql (container_memory_working_set_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" } / container_spec_memory_limit_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }) * 100 ``` @@ -93,7 +93,7 @@ Top 5 pods by memory usage: topk(5, sum by (pod) ( container_memory_working_set_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" } ) @@ -157,7 +157,7 @@ The DocumentDB operator binary exposes standard controller-runtime metrics on it | `controller_runtime_reconcile_errors_total` | Counter | Total reconciliation errors | | `controller_runtime_reconcile_time_seconds` | Histogram | Time spent in reconciliation | -**Common labels:** `controller` (e.g., `documentdb`, `backup`, `scheduledbackup`, `certificate`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`) +**Common labels:** `controller` (e.g., `documentdb-controller`, `backup`, `scheduledbackup`, `certificate-controller`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`) #### Example Queries @@ -175,7 +175,7 @@ P95 reconciliation latency for the DocumentDB controller: histogram_quantile(0.95, sum by (le) ( rate(controller_runtime_reconcile_time_seconds_bucket{ - controller="documentdb" + controller="documentdb-controller" }[5m]) ) ) @@ -206,14 +206,14 @@ sum by (controller) ( Work queue depth by controller: 
```promql -workqueue_depth{name=~"documentdb|backup|scheduledbackup|certificate"} +workqueue_depth{name=~"documentdb-controller|backup|scheduledbackup|certificate-controller"} ``` Average time items spend waiting in queue: ```promql -rate(workqueue_queue_duration_seconds_sum{name="documentdb"}[5m]) -/ rate(workqueue_queue_duration_seconds_count{name="documentdb"}[5m]) +rate(workqueue_queue_duration_seconds_sum{name="documentdb-controller"}[5m]) +/ rate(workqueue_queue_duration_seconds_count{name="documentdb-controller"}[5m]) ``` ## CNPG / PostgreSQL Metrics diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 7e1422c8..77ae54dd 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -30,7 +30,7 @@ The recommended monitoring stack collects infrastructure metrics from these cont └──────────────────┬───────────────────────────────────┘ │ remote write ┌──────────────────┴───────────────────────────────────┐ -│ OpenTelemetry Collector (DaemonSet) │ +│ OpenTelemetry Collector │ │ Receivers: kubeletstats, k8s_cluster, prometheus │ │ Processors: resource detection, attribute enrichment │ │ Exporters: prometheusremotewrite │ @@ -45,15 +45,15 @@ The recommended monitoring stack collects infrastructure metrics from these cont └──────────────────────────────────────────────────────┘ ``` -### Why DaemonSet over sidecar +### Collector deployment modes -The OpenTelemetry Collector runs as a **DaemonSet** (one collector per node) rather than as a sidecar per pod. This provides: +The [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. 
This provides: - Lower resource overhead — one collector per node instead of one per pod - Node-level metrics visibility (CPU, memory, filesystem) - Simpler configuration and management -For multi-tenant setups requiring per-namespace isolation, a **Deployment** per namespace is used instead. See the [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) for both patterns. +The [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements. ## Prometheus Integration @@ -104,9 +104,9 @@ spec: | Metric | Description | Container | |--------|-------------|-----------| -| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, gateway | -| `container_memory_working_set_bytes` | Current memory usage | postgres, gateway | -| `container_spec_memory_limit_bytes` | Memory limit | postgres, gateway | +| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, documentdb-gateway | +| `container_memory_working_set_bytes` | Current memory usage | postgres, documentdb-gateway | +| `container_spec_memory_limit_bytes` | Memory limit | postgres, documentdb-gateway | | `container_network_receive_bytes_total` | Network bytes received | pod-level | | `container_fs_reads_bytes_total` | Filesystem read bytes | postgres | @@ -144,11 +144,11 @@ groups: - alert: DocumentDBHighCPU expr: | (rate(container_cpu_usage_seconds_total{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }[5m]) / on(pod, container) container_spec_cpu_quota{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" } * 1e5) > 0.8 for: 5m @@ -160,11 
+160,11 @@ groups: - alert: DocumentDBHighMemory expr: | (container_memory_working_set_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" } / container_spec_memory_limit_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }) > 0.85 for: 5m @@ -186,7 +186,7 @@ groups: - alert: DocumentDBReconcileErrors expr: | rate(controller_runtime_reconcile_errors_total{ - controller="documentdb" + controller="documentdb-controller" }[5m]) > 0 for: 10m labels: @@ -206,14 +206,14 @@ groups: - record: documentdb:cpu_usage_rate5m expr: | rate(container_cpu_usage_seconds_total{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }[5m]) - record: documentdb:memory_usage_bytes expr: | container_memory_working_set_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" } @@ -221,7 +221,7 @@ groups: expr: | (documentdb:memory_usage_bytes / container_spec_memory_limit_bytes{ - container=~"postgres|gateway", + container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }) * 100 ``` From 56c9eaf9e2fffd781de85f99a26cf58cad71af9a Mon Sep 17 00:00:00 2001 From: udsmicrosoft <136555787+udsmicrosoft@users.noreply.github.com> Date: Mon, 23 Feb 2026 03:17:56 -0500 Subject: [PATCH 3/5] Update docs/operator-public-documentation/preview/monitoring/overview.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 77ae54dd..5a14c431 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -254,7 +254,7 @@ kubectl 
port-forward -n sales-namespace svc/grafana-sales 3001:3000 & # Open http://localhost:3001 (admin / admin123) ``` -See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics. +See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics. ## Verification From b5468d11075205fe24fd65b25e53d4f7db9d2498 Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 23 Feb 2026 03:25:13 -0500 Subject: [PATCH 4/5] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?links,=20labels,=20CPU=20queries,=20prerequisites?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: urismiley --- .../preview/monitoring/metrics.md | 8 +++- .../preview/monitoring/overview.md | 42 +++++++++++++++---- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md index 493c0c61..95540c0c 100644 --- a/docs/operator-public-documentation/preview/monitoring/metrics.md +++ b/docs/operator-public-documentation/preview/monitoring/metrics.md @@ -38,7 +38,11 @@ CPU utilization as a percentage of limit: (container_spec_cpu_quota{ container="postgres", pod=~".*documentdb.*" -} / 1e5)) * 100 +} +/ container_spec_cpu_period{ + container="postgres", + pod=~".*documentdb.*" +})) * 100 ``` Compare gateway vs. postgres CPU across all pods: @@ -294,7 +298,7 @@ The DocumentDB Gateway does not currently expose application-level metrics. 
When | `documentdb_gateway_write_operations_total` | Counter | Write operations (labels: `database`, `collection`) | | `documentdb_gateway_errors_total` | Counter | Error count (labels: `error_type`, `operation`) | -These will be collected via Prometheus scraping (`/metrics` endpoint) or OTLP push. See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the planned implementation. +These will be collected via Prometheus scraping (`/metrics` endpoint) or OTLP push. See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the planned implementation. ## OpenTelemetry Metric Names diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 5a14c431..c3d14b93 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -7,6 +7,7 @@ This guide describes how to monitor DocumentDB clusters running on Kubernetes us - A running Kubernetes cluster with the DocumentDB operator installed - [Helm 3](https://helm.sh/docs/intro/install/) for deploying Prometheus and Grafana - [kubectl](https://kubernetes.io/docs/tasks/tools/) configured for your cluster +- [`jq`](https://jqlang.github.io/jq/) for processing JSON in verification commands - (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments ## Architecture @@ -47,13 +48,13 @@ The recommended monitoring stack collects infrastructure metrics from these cont ### Collector deployment modes -The [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry 
Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. This provides: +The [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. This provides: - Lower resource overhead — one collector per node instead of one per pod - Node-level metrics visibility (CPU, memory, filesystem) - Simpler configuration and management -The [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements. +The [telemetry playground](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements. ## Prometheus Integration @@ -79,9 +80,24 @@ The underlying CloudNative-PG cluster exposes PostgreSQL metrics on each pod. Th ### ServiceMonitor / PodMonitor -If you use the Prometheus Operator, create a `ServiceMonitor` targeting the operator's metrics service: +The operator does not ship a metrics `Service` or `ServiceMonitor` by default. If you use the Prometheus Operator and want to scrape controller-runtime metrics, create a `Service` and `ServiceMonitor` matching your deployment. 
For example, with a Helm release named `documentdb`: ```yaml +apiVersion: v1 +kind: Service +metadata: + name: documentdb-operator-metrics + namespace: documentdb-operator + labels: + app: documentdb +spec: + selector: + app: documentdb # must match your Helm release name + ports: + - name: metrics + port: 8443 + targetPort: 8443 +--- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: @@ -90,14 +106,17 @@ metadata: spec: selector: matchLabels: - app: documentdb-operator + app: documentdb # must match the Service labels above endpoints: - port: metrics scheme: https tlsConfig: - insecureSkipVerify: true + insecureSkipVerify: true # use a proper CA bundle in production ``` +!!! note + Adjust the `app` label to match your Helm release name. The operator must be started with `--metrics-bind-address=:8443` for the endpoint to be available. + ## Key Metrics ### Container Resource Metrics @@ -147,10 +166,15 @@ groups: container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" }[5m]) - / on(pod, container) container_spec_cpu_quota{ + / on(pod, container) + (container_spec_cpu_quota{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" + } + / container_spec_cpu_period{ container=~"postgres|documentdb-gateway", pod=~".*documentdb.*" - } * 1e5) > 0.8 + })) > 0.8 for: 5m labels: severity: warning @@ -228,7 +252,7 @@ groups: ## Telemetry Playground -The [`documentdb-playground/telemetry/`](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with: +The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with: - Multi-tenant namespace isolation (separate Prometheus + Grafana per team) - OpenTelemetry Collector configurations for cAdvisor metric scraping @@ -251,7 +275,7 @@ cd 
documentdb-playground/telemetry/scripts/ # Access Grafana kubectl port-forward -n sales-namespace svc/grafana-sales 3001:3000 & -# Open http://localhost:3001 (admin / admin123) +# Open http://localhost:3001 (playground default: admin / admin123 — change in production) ``` See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics. From 28fa4726daa7e9429872caf3a81ffc0922d29df0 Mon Sep 17 00:00:00 2001 From: urismiley Date: Tue, 24 Feb 2026 12:57:55 -0500 Subject: [PATCH 5/5] docs: remove Alerts section from monitoring overview Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 98 ------------------- 1 file changed, 98 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index c3d14b93..6a67361b 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -152,104 +152,6 @@ When the CNPG monitoring is enabled, additional PostgreSQL-level metrics are ava For the full CNPG metrics reference, see the [CloudNative-PG monitoring documentation](https://cloudnative-pg.io/documentation/current/monitoring/). 
-## Alerts - -### Recommended Alert Rules - -```yaml -groups: - - name: documentdb.alerts - rules: - - alert: DocumentDBHighCPU - expr: | - (rate(container_cpu_usage_seconds_total{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - }[5m]) - / on(pod, container) - (container_spec_cpu_quota{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - } - / container_spec_cpu_period{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - })) > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "High CPU on {{ $labels.pod }}/{{ $labels.container }}" - - - alert: DocumentDBHighMemory - expr: | - (container_memory_working_set_bytes{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - } - / container_spec_memory_limit_bytes{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - }) > 0.85 - for: 5m - labels: - severity: warning - annotations: - summary: "High memory on {{ $labels.pod }}/{{ $labels.container }}" - - - alert: DocumentDBPodRestarting - expr: | - increase(kube_pod_container_status_restarts_total{ - pod=~".*documentdb.*" - }[1h]) > 3 - labels: - severity: critical - annotations: - summary: "{{ $labels.pod }} restarted {{ $value }} times in the last hour" - - - alert: DocumentDBReconcileErrors - expr: | - rate(controller_runtime_reconcile_errors_total{ - controller="documentdb-controller" - }[5m]) > 0 - for: 10m - labels: - severity: warning - annotations: - summary: "DocumentDB controller has reconciliation errors" -``` - -### Recording Rules - -Pre-compute common queries with recording rules for dashboard efficiency: - -```yaml -groups: - - name: documentdb.rules - rules: - - record: documentdb:cpu_usage_rate5m - expr: | - rate(container_cpu_usage_seconds_total{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - }[5m]) - - - record: documentdb:memory_usage_bytes - expr: | - container_memory_working_set_bytes{ - 
container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - } - - - record: documentdb:memory_utilization_percent - expr: | - (documentdb:memory_usage_bytes - / container_spec_memory_limit_bytes{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - }) * 100 -``` - ## Telemetry Playground The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with: