From 5779a876e1d167f1f8bc9b6bbe209d11d561e864 Mon Sep 17 00:00:00 2001
From: urismiley <urismiley@microsoft.com>
Date: Mon, 23 Feb 2026 02:26:22 -0500
Subject: [PATCH 1/5] docs: add monitoring overview and metrics reference
 guides

Signed-off-by: urismiley <urismiley@microsoft.com>
---
 .../preview/monitoring/metrics.md             | 311 ++++++++++++++++++
 .../preview/monitoring/overview.md            | 285 ++++++++++++++++
 mkdocs.yml                                    |   3 +
 3 files changed, 599 insertions(+)
 create mode 100644 docs/operator-public-documentation/preview/monitoring/metrics.md
 create mode 100644 docs/operator-public-documentation/preview/monitoring/overview.md

diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md
new file mode 100644
index 00000000..ae464f24
--- /dev/null
+++ b/docs/operator-public-documentation/preview/monitoring/metrics.md
@@ -0,0 +1,311 @@
+# Metrics Reference
+
+This page documents the key metrics available when monitoring a DocumentDB cluster, organized by source. Each section includes the metric name, description, labels, and example PromQL queries.
+
+## Container Resource Metrics
+
+These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). They cover CPU, memory, network, and filesystem for the **postgres** and **gateway** containers in each DocumentDB pod.
+
+### CPU
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `container_cpu_usage_seconds_total` | Counter | Cumulative CPU time consumed in seconds |
+| `container_spec_cpu_quota` | Gauge | CPU quota (microseconds per `cpu_period`) |
+| `container_spec_cpu_period` | Gauge | CPU CFS scheduling period (microseconds) |
+
+**Common labels:** `namespace`, `pod`, `container`, `node`
+
+#### Example Queries
+
+CPU usage rate per container over 5 minutes:
+
+```promql
+rate(container_cpu_usage_seconds_total{
+  container=~"postgres|gateway",
+  pod=~".*documentdb.*"
+}[5m])
+```
+
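+CPU utilization as a percentage of limit (the division by `1e5` assumes the default 100 ms CFS period reported by `container_spec_cpu_period`):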
+
+```promql
+(rate(container_cpu_usage_seconds_total{
+  container="postgres",
+  pod=~".*documentdb.*"
+}[5m])
+/ on(pod, container)
+(container_spec_cpu_quota{
+  container="postgres",
+  pod=~".*documentdb.*"
+} / 1e5)) * 100
+```
+
+Compare gateway vs. postgres CPU across all pods:
+
+```promql
+sum by (container) (
+  rate(container_cpu_usage_seconds_total{
+    container=~"postgres|gateway",
+    pod=~".*documentdb.*"
+  }[5m])
+)
+```
+
+### Memory
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `container_memory_working_set_bytes` | Gauge | Current working set memory (bytes) |
+| `container_memory_rss` | Gauge | Resident set size (bytes) |
+| `container_memory_cache` | Gauge | Page cache memory (bytes) |
+| `container_spec_memory_limit_bytes` | Gauge | Memory limit (bytes) |
+
+**Common labels:** `namespace`, `pod`, `container`, `node`
+
+#### Example Queries
+
+Memory usage in MiB per container:
+
+```promql
+container_memory_working_set_bytes{
+  container=~"postgres|gateway",
+  pod=~".*documentdb.*"
+} / 1024 / 1024
+```
+
+Memory utilization as a percentage of limit:
+
+```promql
+(container_memory_working_set_bytes{
+  container=~"postgres|gateway",
+  pod=~".*documentdb.*"
+}
+/ container_spec_memory_limit_bytes{
+  container=~"postgres|gateway",
+  pod=~".*documentdb.*"
+}) * 100
+```
+
+Top 5 pods by memory usage:
+
+```promql
+topk(5,
+  sum by (pod) (
+    container_memory_working_set_bytes{
+      container=~"postgres|gateway",
+      pod=~".*documentdb.*"
+    }
+  )
+)
+```
+
+### Network
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `container_network_receive_bytes_total` | Counter | Bytes received |
+| `container_network_transmit_bytes_total` | Counter | Bytes transmitted |
+
+**Common labels:** `namespace`, `pod`, `interface`
+
+#### Example Queries
+
+Network throughput (bytes/sec) per pod:
+
+```promql
+sum by (pod) (
+  rate(container_network_receive_bytes_total{
+    pod=~".*documentdb.*"
+  }[5m])
+  + rate(container_network_transmit_bytes_total{
+    pod=~".*documentdb.*"
+  }[5m])
+)
+```
+
+### Filesystem
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `container_fs_usage_bytes` | Gauge | Filesystem usage (bytes) |
+| `container_fs_reads_bytes_total` | Counter | Filesystem read bytes |
+| `container_fs_writes_bytes_total` | Counter | Filesystem write bytes |
+
+**Common labels:** `namespace`, `pod`, `container`, `device`
+
+#### Example Queries
+
+Disk write throughput (bytes/sec) for the postgres container:
+
+```promql
+rate(container_fs_writes_bytes_total{
+  container="postgres",
+  pod=~".*documentdb.*"
+}[5m])
+```
+
+## Operator Metrics (controller-runtime)
+
+The DocumentDB operator binary exposes standard controller-runtime metrics on its metrics endpoint. These track reconciliation performance and work queue health.
+
+### Reconciliation
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `controller_runtime_reconcile_total` | Counter | Total reconciliations |
+| `controller_runtime_reconcile_errors_total` | Counter | Total reconciliation errors |
+| `controller_runtime_reconcile_time_seconds` | Histogram | Time spent in reconciliation |
+
+**Common labels:** `controller` (e.g., `documentdb`, `backup`, `scheduledbackup`, `certificate`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`)
+
+#### Example Queries
+
+Reconciliation error rate by controller:
+
+```promql
+sum by (controller) (
+  rate(controller_runtime_reconcile_errors_total[5m])
+)
+```
+
+P95 reconciliation latency for the DocumentDB controller:
+
+```promql
+histogram_quantile(0.95,
+  sum by (le) (
+    rate(controller_runtime_reconcile_time_seconds_bucket{
+      controller="documentdb"
+    }[5m])
+  )
+)
+```
+
+Reconciliation throughput (reconciles/sec):
+
+```promql
+sum by (controller) (
+  rate(controller_runtime_reconcile_total[5m])
+)
+```
+
+### Work Queue
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `workqueue_depth` | Gauge | Current number of items in the queue |
+| `workqueue_adds_total` | Counter | Total items added |
+| `workqueue_queue_duration_seconds` | Histogram | Time items spend in queue before processing |
+| `workqueue_work_duration_seconds` | Histogram | Time spent processing items |
+| `workqueue_retries_total` | Counter | Total retries |
+
+**Common labels:** `name` (queue name, maps to controller name)
+
+#### Example Queries
+
+Work queue depth by controller:
+
+```promql
+workqueue_depth{name=~"documentdb|backup|scheduledbackup|certificate"}
+```
+
+Average time items spend waiting in queue:
+
+```promql
+rate(workqueue_queue_duration_seconds_sum{name="documentdb"}[5m])
+/ rate(workqueue_queue_duration_seconds_count{name="documentdb"}[5m])
+```
+
+## CNPG / PostgreSQL Metrics
+
+CloudNative-PG exposes PostgreSQL-level metrics from each managed pod. These are available when CNPG monitoring is enabled. For the full list, see the [CloudNative-PG monitoring docs](https://cloudnative-pg.io/documentation/current/monitoring/).
+
+### Replication
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `cnpg_pg_replication_lag` | Gauge | Replication lag in seconds |
+| `cnpg_pg_replication_streaming_replicas` | Gauge | Number of streaming replicas |
+
+#### Example Queries
+
+Replication lag per pod:
+
+```promql
+cnpg_pg_replication_lag{pod=~".*documentdb.*"}
+```
+
+### Connections
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `cnpg_pg_stat_activity_count` | Gauge | Active backend connections by state |
+
+#### Example Queries
+
+Active connections by state:
+
+```promql
+sum by (state) (
+  cnpg_pg_stat_activity_count{pod=~".*documentdb.*"}
+)
+```
+
+### Storage
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `cnpg_pg_database_size_bytes` | Gauge | Total database size |
+| `cnpg_pg_stat_bgwriter_buffers_checkpoint` | Counter | Buffers written during checkpoints |
+
+#### Example Queries
+
+Database size in GiB:
+
+```promql
+cnpg_pg_database_size_bytes{pod=~".*documentdb.*"} / 1024 / 1024 / 1024
+```
+
+### Cluster Health
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `cnpg_collector_up` | Gauge | 1 if the CNPG metrics collector is running |
+| `cnpg_pg_postmaster_start_time` | Gauge | PostgreSQL start timestamp |
+
+#### Example Queries
+
+Detect pods where the metrics collector is down:
+
+```promql
+cnpg_collector_up{pod=~".*documentdb.*"} == 0
+```
+
+## Gateway Metrics (Future)
+
+The DocumentDB Gateway does not currently expose application-level metrics. When implemented, expect metrics like:
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `documentdb_gateway_requests_total` | Counter | Total API requests (labels: `method`, `status`) |
+| `documentdb_gateway_request_duration_seconds` | Histogram | Request latency |
+| `documentdb_gateway_active_connections` | Gauge | Current connection count |
+| `documentdb_gateway_read_operations_total` | Counter | Read operations (labels: `database`, `collection`) |
+| `documentdb_gateway_write_operations_total` | Counter | Write operations (labels: `database`, `collection`) |
+| `documentdb_gateway_errors_total` | Counter | Error count (labels: `error_type`, `operation`) |
+
+These will be collected via Prometheus scraping (`/metrics` endpoint) or OTLP push. See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the planned implementation.
+
+## OpenTelemetry Metric Names
+
+When using the OpenTelemetry `kubeletstats` receiver, metric names use the OpenTelemetry naming convention instead of Prometheus-style names:
+
+| OpenTelemetry Name | Prometheus Equivalent |
+|---|---|
+| `k8s.container.cpu.time` | `container_cpu_usage_seconds_total` |
+| `k8s.container.memory.usage` | `container_memory_working_set_bytes` |
+| `k8s.container.cpu.limit` | `container_spec_cpu_quota` |
+| `k8s.container.memory.limit` | `container_spec_memory_limit_bytes` |
+| `k8s.pod.network.io` | `container_network_*_bytes_total` |
+
+When writing queries, use the naming convention matching your collection method. The telemetry playground uses the OpenTelemetry names; a direct Prometheus scrape of cAdvisor uses Prometheus names.
diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md
new file mode 100644
index 00000000..7e1422c8
--- /dev/null
+++ b/docs/operator-public-documentation/preview/monitoring/overview.md
@@ -0,0 +1,285 @@
+# Monitoring Overview
+
+This guide describes how to monitor DocumentDB clusters running on Kubernetes using OpenTelemetry, Prometheus, and Grafana.
+
+## Prerequisites
+
+- A running Kubernetes cluster with the DocumentDB operator installed
+- [Helm 3](https://helm.sh/docs/intro/install/) for deploying Prometheus and Grafana
+- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured for your cluster
+- (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments
+
+## Architecture
+
+A DocumentDB pod contains two containers:
+
+- **PostgreSQL container** — the DocumentDB engine (PostgreSQL with DocumentDB extensions)
+- **Gateway container** — MongoDB-compatible API sidecar
+
+The recommended monitoring stack collects infrastructure metrics from these containers and stores them for visualization and alerting.
+
+```
+┌──────────────────────────────────────────────────────┐
+│                   Grafana                             │
+│              (dashboards & alerts)                    │
+└──────────────────┬───────────────────────────────────┘
+                   │
+┌──────────────────┴───────────────────────────────────┐
+│                 Prometheus                            │
+│            (metrics storage)                         │
+└──────────────────┬───────────────────────────────────┘
+                   │ remote write
+┌──────────────────┴───────────────────────────────────┐
+│        OpenTelemetry Collector (DaemonSet)            │
+│  Receivers: kubeletstats, k8s_cluster, prometheus     │
+│  Processors: resource detection, attribute enrichment │
+│  Exporters: prometheusremotewrite                     │
+└──────────────────┬───────────────────────────────────┘
+                   │ scrape
+┌──────────────────┴───────────────────────────────────┐
+│              DocumentDB Pods                          │
+│  ┌──────────────┐  ┌──────────────┐                  │
+│  │  PostgreSQL   │  │   Gateway    │                  │
+│  │  container    │  │  container   │                  │
+│  └──────────────┘  └──────────────┘                  │
+└──────────────────────────────────────────────────────┘
+```
+
+### Why DaemonSet over sidecar
+
+The OpenTelemetry Collector runs as a **DaemonSet** (one collector per node) rather than as a sidecar per pod. This provides:
+
+- Lower resource overhead — one collector per node instead of one per pod
+- Node-level metrics visibility (CPU, memory, filesystem)
+- Simpler configuration and management
+
+For multi-tenant setups requiring per-namespace isolation, a **Deployment** per namespace is used instead. See the [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) for both patterns.
+
+## Prometheus Integration
+
+### Operator Metrics
+
+The DocumentDB operator exposes a metrics endpoint via controller-runtime. By default:
+
+- **Bind address**: controlled by `--metrics-bind-address` (default `0`, disabled)
+- **Secure mode**: `--metrics-secure=true` serves via HTTPS with authn/authz
+- **Certificates**: supply `--metrics-cert-path` for custom TLS, otherwise self-signed certs are generated
+
+To enable metrics scraping, set the bind address in the operator deployment (for example, `:8443` for HTTPS or `:8080` for HTTP).
+
+### CNPG Cluster Metrics
+
+The underlying CloudNative-PG cluster exposes PostgreSQL metrics on each pod. These are collected by the OpenTelemetry Collector's Prometheus receiver via Kubernetes service discovery. Key metric sources:
+
+| Source | Method | Metrics |
+|--------|--------|---------|
+| kubelet/cAdvisor | `kubeletstats` receiver | Container CPU, memory, network, filesystem |
+| Kubernetes API | `k8s_cluster` receiver | Pod status, restart counts, resource requests/limits |
+| Application endpoints | `prometheus` receiver | Custom application metrics (when available) |
+
+### ServiceMonitor / PodMonitor
+
+If you use the Prometheus Operator, create a `ServiceMonitor` targeting the operator's metrics service:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: documentdb-operator
+  namespace: documentdb-operator
+spec:
+  selector:
+    matchLabels:
+      app: documentdb-operator
+  endpoints:
+    - port: metrics
+      scheme: https
+      tlsConfig:
+        insecureSkipVerify: true
+```
+
+## Key Metrics
+
+### Container Resource Metrics
+
+| Metric | Description | Container |
+|--------|-------------|-----------|
+| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, gateway |
+| `container_memory_working_set_bytes` | Current memory usage | postgres, gateway |
+| `container_spec_memory_limit_bytes` | Memory limit | postgres, gateway |
+| `container_network_receive_bytes_total` | Network bytes received | pod-level |
+| `container_fs_reads_bytes_total` | Filesystem read bytes | postgres |
+
+### Controller-Runtime Metrics
+
+| Metric | Description |
+|--------|-------------|
+| `controller_runtime_reconcile_total` | Total reconciliations by controller and result |
+| `controller_runtime_reconcile_errors_total` | Total reconciliation errors |
+| `controller_runtime_reconcile_time_seconds` | Reconciliation duration histogram |
+| `workqueue_depth` | Current depth of the work queue |
+| `workqueue_adds_total` | Total items added to the work queue |
+
+### CNPG / PostgreSQL Metrics
+
+When CNPG monitoring is enabled, additional PostgreSQL-level metrics are available:
+
+| Metric | Description |
+|--------|-------------|
+| `cnpg_collector_up` | Whether the CNPG metrics collector is running |
+| `cnpg_pg_replication_lag` | Replication lag in seconds |
+| `cnpg_pg_stat_activity_count` | Number of active connections |
+| `cnpg_pg_database_size_bytes` | Database size |
+
+For the full CNPG metrics reference, see the [CloudNative-PG monitoring documentation](https://cloudnative-pg.io/documentation/current/monitoring/).
+
+## Alerts
+
+### Recommended Alert Rules
+
+```yaml
+groups:
+  - name: documentdb.alerts
+    rules:
+      - alert: DocumentDBHighCPU
+        expr: |
+          (rate(container_cpu_usage_seconds_total{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          }[5m])
+          / on(pod, container) container_spec_cpu_quota{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          } * 1e5) > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU on {{ $labels.pod }}/{{ $labels.container }}"
+
+      - alert: DocumentDBHighMemory
+        expr: |
+          (container_memory_working_set_bytes{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          }
+          / container_spec_memory_limit_bytes{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          }) > 0.85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory on {{ $labels.pod }}/{{ $labels.container }}"
+
+      - alert: DocumentDBPodRestarting
+        expr: |
+          increase(kube_pod_container_status_restarts_total{
+            pod=~".*documentdb.*"
+          }[1h]) > 3
+        labels:
+          severity: critical
+        annotations:
+          summary: "{{ $labels.pod }} restarted {{ $value }} times in the last hour"
+
+      - alert: DocumentDBReconcileErrors
+        expr: |
+          rate(controller_runtime_reconcile_errors_total{
+            controller="documentdb"
+          }[5m]) > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "DocumentDB controller has reconciliation errors"
+```
+
+### Recording Rules
+
+Pre-compute common queries with recording rules for dashboard efficiency:
+
+```yaml
+groups:
+  - name: documentdb.rules
+    rules:
+      - record: documentdb:cpu_usage_rate5m
+        expr: |
+          rate(container_cpu_usage_seconds_total{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          }[5m])
+
+      - record: documentdb:memory_usage_bytes
+        expr: |
+          container_memory_working_set_bytes{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          }
+
+      - record: documentdb:memory_utilization_percent
+        expr: |
+          (documentdb:memory_usage_bytes
+          / container_spec_memory_limit_bytes{
+            container=~"postgres|gateway",
+            pod=~".*documentdb.*"
+          }) * 100
+```
+
+## Telemetry Playground
+
+The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with:
+
+- Multi-tenant namespace isolation (separate Prometheus + Grafana per team)
+- OpenTelemetry Collector configurations for cAdvisor metric scraping
+- Automated Grafana dashboard provisioning scripts
+- AKS cluster setup with the OpenTelemetry Operator
+
+Run the quickstart:
+
+```bash
+cd documentdb-playground/telemetry/scripts/
+
+# One-time infrastructure setup
+./create-cluster.sh --install-all
+
+# Deploy multi-tenant DocumentDB + monitoring
+./deploy-multi-tenant-telemetry.sh
+
+# Create Grafana dashboards
+./setup-grafana-dashboards.sh sales-namespace
+
+# Access Grafana
+kubectl port-forward -n sales-namespace svc/grafana-sales 3001:3000 &
+# Open http://localhost:3001 (admin / admin123)
+```
+
+See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics.
+
+## Verification
+
+After deploying the monitoring stack, confirm that metrics are flowing:
+
+```bash
+# Check that the OpenTelemetry Collector pods are running
+kubectl get pods -l app.kubernetes.io/name=opentelemetry-collector
+
+# Verify Prometheus is receiving metrics (port-forward first)
+kubectl port-forward svc/prometheus-server 9090:80 &
+curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result | length'
+
+# Confirm DocumentDB container metrics are present
+curl -sG 'http://localhost:9090/api/v1/query' --data-urlencode 'query=container_cpu_usage_seconds_total{pod=~".*documentdb.*"}' \
+  | jq '.data.result | length'
+```
+
+If no metrics appear, check:
+
+- The collector's service account has RBAC access to the kubelet metrics API
+- Namespace label filters in the collector config match your DocumentDB namespace
+- The Prometheus remote-write endpoint is reachable from the collector
+
+## Next Steps
+
+- [Metrics Reference](metrics.md) — detailed metric descriptions and PromQL query examples
+- [CloudNative-PG Monitoring](https://cloudnative-pg.io/documentation/current/monitoring/) — upstream PostgreSQL metrics
diff --git a/mkdocs.yml b/mkdocs.yml
index efcb23f1..6dcd7e3f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -10,6 +10,9 @@ nav:
       - Get Started: preview/index.md
       - Advanced Configuration: preview/advanced-configuration/README.md
       - Backup and Restore: preview/backup-and-restore.md
+      - Monitoring:
+          - Overview: preview/monitoring/overview.md
+          - Metrics Reference: preview/monitoring/metrics.md
       - FAQ: preview/faq.md
   - Tools:
       - Kubectl Plugin: preview/kubectl-plugin.md

From 086e5e9a590409494330bb16e6d2cd2649aa71c8 Mon Sep 17 00:00:00 2001
From: urismiley <urismiley@microsoft.com>
Date: Mon, 23 Feb 2026 03:04:51 -0500
Subject: [PATCH 2/5] fix: correct container and controller names in monitoring
 docs

Signed-off-by: urismiley <urismiley@microsoft.com>
---
 .../preview/monitoring/metrics.md             | 24 +++++++--------
 .../preview/monitoring/overview.md            | 30 +++++++++----------
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md
index ae464f24..493c0c61 100644
--- a/docs/operator-public-documentation/preview/monitoring/metrics.md
+++ b/docs/operator-public-documentation/preview/monitoring/metrics.md
@@ -4,7 +4,7 @@ This page documents the key metrics available when monitoring a DocumentDB clust
 
 ## Container Resource Metrics
 
-These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). They cover CPU, memory, network, and filesystem for the **postgres** and **gateway** containers in each DocumentDB pod.
+These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). They cover CPU, memory, network, and filesystem for the **postgres** and **documentdb-gateway** containers in each DocumentDB pod.
 
 ### CPU
 
@@ -22,7 +22,7 @@ CPU usage rate per container over 5 minutes:
 
 ```promql
 rate(container_cpu_usage_seconds_total{
-  container=~"postgres|gateway",
+  container=~"postgres|documentdb-gateway",
   pod=~".*documentdb.*"
 }[5m])
 ```
@@ -46,7 +46,7 @@ Compare gateway vs. postgres CPU across all pods:
 ```promql
 sum by (container) (
   rate(container_cpu_usage_seconds_total{
-    container=~"postgres|gateway",
+    container=~"postgres|documentdb-gateway",
     pod=~".*documentdb.*"
   }[5m])
 )
@@ -69,7 +69,7 @@ Memory usage in MiB per container:
 
 ```promql
 container_memory_working_set_bytes{
-  container=~"postgres|gateway",
+  container=~"postgres|documentdb-gateway",
   pod=~".*documentdb.*"
 } / 1024 / 1024
 ```
@@ -78,11 +78,11 @@ Memory utilization as a percentage of limit:
 
 ```promql
 (container_memory_working_set_bytes{
-  container=~"postgres|gateway",
+  container=~"postgres|documentdb-gateway",
   pod=~".*documentdb.*"
 }
 / container_spec_memory_limit_bytes{
-  container=~"postgres|gateway",
+  container=~"postgres|documentdb-gateway",
   pod=~".*documentdb.*"
 }) * 100
 ```
@@ -93,7 +93,7 @@ Top 5 pods by memory usage:
 topk(5,
   sum by (pod) (
     container_memory_working_set_bytes{
-      container=~"postgres|gateway",
+      container=~"postgres|documentdb-gateway",
       pod=~".*documentdb.*"
     }
   )
@@ -157,7 +157,7 @@ The DocumentDB operator binary exposes standard controller-runtime metrics on it
 | `controller_runtime_reconcile_errors_total` | Counter | Total reconciliation errors |
 | `controller_runtime_reconcile_time_seconds` | Histogram | Time spent in reconciliation |
 
-**Common labels:** `controller` (e.g., `documentdb`, `backup`, `scheduledbackup`, `certificate`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`)
+**Common labels:** `controller` (e.g., `documentdb-controller`, `backup`, `scheduledbackup`, `certificate-controller`, `persistentvolume`); `controller_runtime_reconcile_total` additionally carries `result` (`success`, `error`, `requeue`, `requeue_after`)
 
 #### Example Queries
 
@@ -175,7 +175,7 @@ P95 reconciliation latency for the DocumentDB controller:
 histogram_quantile(0.95,
   sum by (le) (
     rate(controller_runtime_reconcile_time_seconds_bucket{
-      controller="documentdb"
+      controller="documentdb-controller"
     }[5m])
   )
 )
@@ -206,14 +206,14 @@ sum by (controller) (
 Work queue depth by controller:
 
 ```promql
-workqueue_depth{name=~"documentdb|backup|scheduledbackup|certificate"}
+workqueue_depth{name=~"documentdb-controller|backup|scheduledbackup|certificate-controller"}
 ```
 
 Average time items spend waiting in queue:
 
 ```promql
-rate(workqueue_queue_duration_seconds_sum{name="documentdb"}[5m])
-/ rate(workqueue_queue_duration_seconds_count{name="documentdb"}[5m])
+rate(workqueue_queue_duration_seconds_sum{name="documentdb-controller"}[5m])
+/ rate(workqueue_queue_duration_seconds_count{name="documentdb-controller"}[5m])
 ```
 
 ## CNPG / PostgreSQL Metrics
diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md
index 7e1422c8..77ae54dd 100644
--- a/docs/operator-public-documentation/preview/monitoring/overview.md
+++ b/docs/operator-public-documentation/preview/monitoring/overview.md
@@ -30,7 +30,7 @@ The recommended monitoring stack collects infrastructure metrics from these cont
 └──────────────────┬───────────────────────────────────┘
                    │ remote write
 ┌──────────────────┴───────────────────────────────────┐
-│        OpenTelemetry Collector (DaemonSet)            │
+│        OpenTelemetry Collector                        │
 │  Receivers: kubeletstats, k8s_cluster, prometheus     │
 │  Processors: resource detection, attribute enrichment │
 │  Exporters: prometheusremotewrite                     │
@@ -45,15 +45,15 @@ The recommended monitoring stack collects infrastructure metrics from these cont
 └──────────────────────────────────────────────────────┘
 ```
 
-### Why DaemonSet over sidecar
+### Collector deployment modes
 
-The OpenTelemetry Collector runs as a **DaemonSet** (one collector per node) rather than as a sidecar per pod. This provides:
+The [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. This provides:
 
 - Lower resource overhead — one collector per node instead of one per pod
 - Node-level metrics visibility (CPU, memory, filesystem)
 - Simpler configuration and management
 
-For multi-tenant setups requiring per-namespace isolation, a **Deployment** per namespace is used instead. See the [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) for both patterns.
+The [telemetry playground](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements.
 
 ## Prometheus Integration
 
@@ -104,9 +104,9 @@ spec:
 
 | Metric | Description | Container |
 |--------|-------------|-----------|
-| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, gateway |
-| `container_memory_working_set_bytes` | Current memory usage | postgres, gateway |
-| `container_spec_memory_limit_bytes` | Memory limit | postgres, gateway |
+| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, documentdb-gateway |
+| `container_memory_working_set_bytes` | Current memory usage | postgres, documentdb-gateway |
+| `container_spec_memory_limit_bytes` | Memory limit | postgres, documentdb-gateway |
 | `container_network_receive_bytes_total` | Network bytes received | pod-level |
 | `container_fs_reads_bytes_total` | Filesystem read bytes | postgres |
 
@@ -144,11 +144,11 @@ groups:
       - alert: DocumentDBHighCPU
         expr: |
           (rate(container_cpu_usage_seconds_total{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }[5m])
           / on(pod, container) container_spec_cpu_quota{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           } * 1e5) > 0.8
         for: 5m
@@ -160,11 +160,11 @@ groups:
       - alert: DocumentDBHighMemory
         expr: |
           (container_memory_working_set_bytes{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }
           / container_spec_memory_limit_bytes{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }) > 0.85
         for: 5m
@@ -186,7 +186,7 @@ groups:
       - alert: DocumentDBReconcileErrors
         expr: |
           rate(controller_runtime_reconcile_errors_total{
-            controller="documentdb"
+            controller="documentdb-controller"
           }[5m]) > 0
         for: 10m
         labels:
@@ -206,14 +206,14 @@ groups:
       - record: documentdb:cpu_usage_rate5m
         expr: |
           rate(container_cpu_usage_seconds_total{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }[5m])
 
       - record: documentdb:memory_usage_bytes
         expr: |
           container_memory_working_set_bytes{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }
 
@@ -221,7 +221,7 @@ groups:
         expr: |
           (documentdb:memory_usage_bytes
           / container_spec_memory_limit_bytes{
-            container=~"postgres|gateway",
+            container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }) * 100
 ```

From 56c9eaf9e2fffd781de85f99a26cf58cad71af9a Mon Sep 17 00:00:00 2001
From: udsmicrosoft <136555787+udsmicrosoft@users.noreply.github.com>
Date: Mon, 23 Feb 2026 03:17:56 -0500
Subject: [PATCH 3/5] Update
 docs/operator-public-documentation/preview/monitoring/overview.md

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: urismiley <urismiley@microsoft.com>
---
 .../preview/monitoring/overview.md                              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md
index 77ae54dd..5a14c431 100644
--- a/docs/operator-public-documentation/preview/monitoring/overview.md
+++ b/docs/operator-public-documentation/preview/monitoring/overview.md
@@ -254,7 +254,7 @@ kubectl port-forward -n sales-namespace svc/grafana-sales 3001:3000 &
 # Open http://localhost:3001 (admin / admin123)
 ```
 
-See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics.
+See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics.
 
 ## Verification
 

From b5468d11075205fe24fd65b25e53d4f7db9d2498 Mon Sep 17 00:00:00 2001
From: urismiley <urismiley@microsoft.com>
Date: Mon, 23 Feb 2026 03:25:13 -0500
Subject: [PATCH 4/5] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?=
 =?UTF-8?q?links,=20labels,=20CPU=20queries,=20prerequisites?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: urismiley <urismiley@microsoft.com>
---
 .../preview/monitoring/metrics.md             |  8 +++-
 .../preview/monitoring/overview.md            | 42 +++++++++++++++----
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md
index 493c0c61..95540c0c 100644
--- a/docs/operator-public-documentation/preview/monitoring/metrics.md
+++ b/docs/operator-public-documentation/preview/monitoring/metrics.md
@@ -38,7 +38,11 @@ CPU utilization as a percentage of limit:
 (container_spec_cpu_quota{
   container="postgres",
   pod=~".*documentdb.*"
-} / 1e5)) * 100
+}
+/ container_spec_cpu_period{
+  container="postgres",
+  pod=~".*documentdb.*"
+})) * 100
 ```
 
 Compare gateway vs. postgres CPU across all pods:
@@ -294,7 +298,7 @@ The DocumentDB Gateway does not currently expose application-level metrics. When
 | `documentdb_gateway_write_operations_total` | Counter | Write operations (labels: `database`, `collection`) |
 | `documentdb_gateway_errors_total` | Counter | Error count (labels: `error_type`, `operation`) |
 
-These will be collected via Prometheus scraping (`/metrics` endpoint) or OTLP push. See the [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the planned implementation.
+These will be collected via Prometheus scraping (`/metrics` endpoint) or OTLP push. See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the planned implementation.
 
 ## OpenTelemetry Metric Names
 
diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md
index 5a14c431..c3d14b93 100644
--- a/docs/operator-public-documentation/preview/monitoring/overview.md
+++ b/docs/operator-public-documentation/preview/monitoring/overview.md
@@ -7,6 +7,7 @@ This guide describes how to monitor DocumentDB clusters running on Kubernetes us
 - A running Kubernetes cluster with the DocumentDB operator installed
 - [Helm 3](https://helm.sh/docs/intro/install/) for deploying Prometheus and Grafana
 - [kubectl](https://kubernetes.io/docs/tasks/tools/) configured for your cluster
+- [`jq`](https://jqlang.github.io/jq/) for processing JSON in verification commands
 - (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments
 
 ## Architecture
@@ -47,13 +48,13 @@ The recommended monitoring stack collects infrastructure metrics from these cont
 
 ### Collector deployment modes
 
-The [telemetry design document](https://github.com/microsoft/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. This provides:
+The [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. This provides:
 
 - Lower resource overhead — one collector per node instead of one per pod
 - Node-level metrics visibility (CPU, memory, filesystem)
 - Simpler configuration and management
 
-The [telemetry playground](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements.
+The [telemetry playground](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements.
 
 ## Prometheus Integration
 
@@ -79,9 +80,24 @@ The underlying CloudNative-PG cluster exposes PostgreSQL metrics on each pod. Th
 
 ### ServiceMonitor / PodMonitor
 
-If you use the Prometheus Operator, create a `ServiceMonitor` targeting the operator's metrics service:
+The operator does not ship a metrics `Service` or `ServiceMonitor` by default. If you use the Prometheus Operator and want to scrape controller-runtime metrics, create a `Service` and `ServiceMonitor` matching your deployment. For example, with a Helm release named `documentdb`:
 
 ```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: documentdb-operator-metrics
+  namespace: documentdb-operator
+  labels:
+    app: documentdb
+spec:
+  selector:
+    app: documentdb            # must match your Helm release name
+  ports:
+    - name: metrics
+      port: 8443
+      targetPort: 8443
+---
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
@@ -90,14 +106,17 @@ metadata:
 spec:
   selector:
     matchLabels:
-      app: documentdb-operator
+      app: documentdb          # must match the Service labels above
   endpoints:
     - port: metrics
       scheme: https
       tlsConfig:
-        insecureSkipVerify: true
+        insecureSkipVerify: true   # use a proper CA bundle in production
 ```
 
+!!! note
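+    Adjust the `app` value in the Service `selector` to match your Helm release name, and keep the Service's `app` label in sync with the ServiceMonitor's `matchLabels`. The operator must be started with `--metrics-bind-address=:8443` for the metrics endpoint to be available.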
+
 ## Key Metrics
 
 ### Container Resource Metrics
@@ -147,10 +166,15 @@ groups:
             container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
           }[5m])
-          / on(pod, container) container_spec_cpu_quota{
+          / on(pod, container)
+          (container_spec_cpu_quota{
+            container=~"postgres|documentdb-gateway",
+            pod=~".*documentdb.*"
+          }
+          / container_spec_cpu_period{
             container=~"postgres|documentdb-gateway",
             pod=~".*documentdb.*"
-          } * 1e5) > 0.8
+          })) > 0.8
         for: 5m
         labels:
           severity: warning
@@ -228,7 +252,7 @@ groups:
 
 ## Telemetry Playground
 
-The [`documentdb-playground/telemetry/`](https://github.com/microsoft/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with:
+The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with:
 
 - Multi-tenant namespace isolation (separate Prometheus + Grafana per team)
 - OpenTelemetry Collector configurations for cAdvisor metric scraping
@@ -251,7 +275,7 @@ cd documentdb-playground/telemetry/scripts/
 
 # Access Grafana
 kubectl port-forward -n sales-namespace svc/grafana-sales 3001:3000 &
-# Open http://localhost:3001 (admin / admin123)
+# Open http://localhost:3001 (playground default: admin / admin123 — change in production)
 ```
 
 See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics.

From 28fa4726daa7e9429872caf3a81ffc0922d29df0 Mon Sep 17 00:00:00 2001
From: urismiley <urismiley@microsoft.com>
Date: Tue, 24 Feb 2026 12:57:55 -0500
Subject: [PATCH 5/5] docs: remove Alerts section from monitoring overview

Signed-off-by: urismiley <urismiley@microsoft.com>
---
 .../preview/monitoring/overview.md            | 98 -------------------
 1 file changed, 98 deletions(-)

diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md
index c3d14b93..6a67361b 100644
--- a/docs/operator-public-documentation/preview/monitoring/overview.md
+++ b/docs/operator-public-documentation/preview/monitoring/overview.md
@@ -152,104 +152,6 @@ When the CNPG monitoring is enabled, additional PostgreSQL-level metrics are ava
 
 For the full CNPG metrics reference, see the [CloudNative-PG monitoring documentation](https://cloudnative-pg.io/documentation/current/monitoring/).
 
-## Alerts
-
-### Recommended Alert Rules
-
-```yaml
-groups:
-  - name: documentdb.alerts
-    rules:
-      - alert: DocumentDBHighCPU
-        expr: |
-          (rate(container_cpu_usage_seconds_total{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }[5m])
-          / on(pod, container)
-          (container_spec_cpu_quota{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }
-          / container_spec_cpu_period{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          })) > 0.8
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High CPU on {{ $labels.pod }}/{{ $labels.container }}"
-
-      - alert: DocumentDBHighMemory
-        expr: |
-          (container_memory_working_set_bytes{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }
-          / container_spec_memory_limit_bytes{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }) > 0.85
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High memory on {{ $labels.pod }}/{{ $labels.container }}"
-
-      - alert: DocumentDBPodRestarting
-        expr: |
-          increase(kube_pod_container_status_restarts_total{
-            pod=~".*documentdb.*"
-          }[1h]) > 3
-        labels:
-          severity: critical
-        annotations:
-          summary: "{{ $labels.pod }} restarted {{ $value }} times in the last hour"
-
-      - alert: DocumentDBReconcileErrors
-        expr: |
-          rate(controller_runtime_reconcile_errors_total{
-            controller="documentdb-controller"
-          }[5m]) > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "DocumentDB controller has reconciliation errors"
-```
-
-### Recording Rules
-
-Pre-compute common queries with recording rules for dashboard efficiency:
-
-```yaml
-groups:
-  - name: documentdb.rules
-    rules:
-      - record: documentdb:cpu_usage_rate5m
-        expr: |
-          rate(container_cpu_usage_seconds_total{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }[5m])
-
-      - record: documentdb:memory_usage_bytes
-        expr: |
-          container_memory_working_set_bytes{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }
-
-      - record: documentdb:memory_utilization_percent
-        expr: |
-          (documentdb:memory_usage_bytes
-          / container_spec_memory_limit_bytes{
-            container=~"postgres|documentdb-gateway",
-            pod=~".*documentdb.*"
-          }) * 100
-```
-
 ## Telemetry Playground
 
 The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains a complete reference implementation with: