From f4266bf58a6c9e973a85fdcf75e00bdef2106ef4 Mon Sep 17 00:00:00 2001 From: Farah Trigui Date: Tue, 9 Jun 2026 11:22:52 +0200 Subject: [PATCH] externalizing skipper public gitops resources to fleet --- skipper/argocd/crds-application.yaml | 31 ++ skipper/argocd/defaults-application.yaml | 24 + skipper/argocd/operator-application.yaml | 31 ++ .../capabilities/applicationset.template.yaml | 81 +++ skipper/crds/capability-class-crd.yaml | 171 ++++++ skipper/crds/naftiko-capability-crd.yaml | 308 +++++++++++ skipper/defaults/capability-class-dev.yaml | 35 ++ .../defaults/capability-class-premium.yaml | 35 ++ .../defaults/capability-class-standard.yaml | 35 ++ .../dashboards/datadog-values.yaml | 30 ++ .../dashboards/ikanos-dashboard.json | 486 ++++++++++++++++++ .../dashboards/naftiko-datadog-dashboard.json | 16 + 12 files changed, 1283 insertions(+) create mode 100644 skipper/argocd/crds-application.yaml create mode 100644 skipper/argocd/defaults-application.yaml create mode 100644 skipper/argocd/operator-application.yaml create mode 100644 skipper/capabilities/applicationset.template.yaml create mode 100644 skipper/crds/capability-class-crd.yaml create mode 100644 skipper/crds/naftiko-capability-crd.yaml create mode 100644 skipper/defaults/capability-class-dev.yaml create mode 100644 skipper/defaults/capability-class-premium.yaml create mode 100644 skipper/defaults/capability-class-standard.yaml create mode 100644 skipper/observability/dashboards/datadog-values.yaml create mode 100644 skipper/observability/dashboards/ikanos-dashboard.json create mode 100644 skipper/observability/dashboards/naftiko-datadog-dashboard.json diff --git a/skipper/argocd/crds-application.yaml b/skipper/argocd/crds-application.yaml new file mode 100644 index 0000000..3128fa0 --- /dev/null +++ b/skipper/argocd/crds-application.yaml @@ -0,0 +1,31 @@ +# ============================================================================= +# ArgoCD Application — Naftiko CRDs +# 
============================================================================= +# This ensures the Capability and CapabilityClass types exist in the +# Kubernetes API server before anything tries to watch or create them. +# ============================================================================= +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: naftiko-crds + namespace: argocd + labels: + app.kubernetes.io/part-of: naftiko + annotations: + argocd.argoproj.io/sync-wave: "-1" +spec: + project: default + source: + repoURL: https://github.com/naftiko/fleet.git + targetRevision: HEAD + path: skipper/crds # capability-class-crd.yaml and naftiko-capability-crd.yaml + destination: + server: https://kubernetes.default.svc + namespace: naftiko-system + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true # handles large CRDs without the destructive kubectl replace/create path of Replace=true \ No newline at end of file diff --git a/skipper/argocd/defaults-application.yaml b/skipper/argocd/defaults-application.yaml new file mode 100644 index 0000000..e251500 --- /dev/null +++ b/skipper/argocd/defaults-application.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: naftiko-defaults + namespace: argocd + labels: + app.kubernetes.io/part-of: naftiko + annotations: + argocd.argoproj.io/sync-wave: "0" # after crds, before operator +spec: + project: default + source: + repoURL: https://github.com/naftiko/fleet.git + targetRevision: HEAD + path: skipper/defaults # CapabilityClass tiers: dev, standard, premium + destination: + server: https://kubernetes.default.svc + namespace: default # NOTE(review): CapabilityClass is cluster-scoped, so this namespace is presumably unused — confirm + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - ApplyOutOfSyncOnly=true \ No newline at end of file diff --git a/skipper/argocd/operator-application.yaml b/skipper/argocd/operator-application.yaml new file mode 100644 index 0000000..b68cc2d --- /dev/null +++ b/skipper/argocd/operator-application.yaml @@ -0,0 +1,31 @@ +# ============================================================================= +# ArgoCD Application — Naftiko Skipper +# 
============================================================================= +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: naftiko-skipper + namespace: argocd + labels: + app.kubernetes.io/part-of: naftiko + annotations: + argocd.argoproj.io/sync-wave: "1" +spec: + project: default + source: + repoURL: ghcr.io/naftiko/skipper/helm # registry path without a URL scheme; NOTE(review): confirm it is registered in Argo CD as an OCI Helm repository + chart: naftiko-skipper + targetRevision: "1.0.0-alpha4-SNAPSHOT" + helm: # presumably stops the chart from installing its own CapabilityClass objects (skipper/defaults ships them) — confirm against the chart + values: | + capabilityClasses: + enabled: false + destination: + server: https://kubernetes.default.svc + namespace: naftiko-system + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true \ No newline at end of file diff --git a/skipper/capabilities/applicationset.template.yaml b/skipper/capabilities/applicationset.template.yaml new file mode 100644 index 0000000..02f2458 --- /dev/null +++ b/skipper/capabilities/applicationset.template.yaml @@ -0,0 +1,81 @@ +# ============================================================================= +# ArgoCD ApplicationSet — Naftiko Capabilities +# ============================================================================= +# +# This template automatically creates one ArgoCD Application per capability +# directory in your Git repository. +# +# Usage: +# 1. Copy this file: +# cp applicationset.template.yaml my-capabilities.yaml +# +# 2. Replace the two placeholders: +# MY_CAPABILITIES → a name for this ApplicationSet +# (e.g. fleet-capabilities, shipyard-capabilities) +# REPO_URL → your capabilities Git repository URL (appears twice below) +# +# 3. Apply to the cluster: +# kubectl apply -f my-capabilities.yaml +# +# Expected repository structure: +# +# your-repo/ +# └── capabilities/ +# ├── hello-world/ +# │ ├── configmap.yaml ← ikanos spec as ConfigMap +# │ └── capability.yaml ← Capability CR (specRef pattern) +# └── shipyard/ +# ├── configmap.yaml +# └── capability.yaml +# +# Each directory becomes one ArgoCD Application named "cap-<directory-name>" +# and one running Naftiko capability. 
+# +# Prerequisites: +# - Naftiko Skipper operator running (helm install naftiko-skipper ...) +# - ArgoCD installed and connected to your Git repository +# ============================================================================= + +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + # ── Name this ApplicationSet after your capabilities repo ───────────────── + name: MY_CAPABILITIES + namespace: argocd + labels: + app.kubernetes.io/part-of: naftiko + app.kubernetes.io/managed-by: naftiko-skipper +spec: + goTemplate: true # the {{ ... }} fields below are Go templates; NOTE(review): consider goTemplateOptions ["missingkey=error"] so template typos fail loudly + generators: + - git: + # ── Your capabilities Git repository ────────────────────────────── + repoURL: REPO_URL + revision: HEAD + directories: + # Use "capabilities/*" if capabilities are in a subfolder + # Use "*" if capabilities are directly at the repo root + - path: capabilities/* + template: + metadata: + # One ArgoCD Application per capability — named "cap-<directory-name>" + name: 'cap-{{.path.basename}}' + labels: + app.kubernetes.io/part-of: naftiko + naftiko.io/capability: '{{.path.basename}}' + spec: + project: default + source: + repoURL: REPO_URL # same placeholder as the git generator above + targetRevision: HEAD + path: '{{.path.path}}' + destination: + server: https://kubernetes.default.svc + namespace: default + syncPolicy: + automated: + prune: true # removes capability when directory is deleted from Git + selfHeal: true # restores capability if manually modified in cluster + syncOptions: + - ApplyOutOfSyncOnly=true + - CreateNamespace=false \ No newline at end of file diff --git a/skipper/crds/capability-class-crd.yaml b/skipper/crds/capability-class-crd.yaml new file mode 100644 index 0000000..6a5b02b --- /dev/null +++ b/skipper/crds/capability-class-crd.yaml @@ -0,0 +1,171 @@ +# ============================================================================= +# CapabilityClass CRD — naftiko.io/v1alpha3/CapabilityClass +# ============================================================================= +# Cluster-scoped resource that defines resource tiers for capabilities. 
+# The operator uses info.labels["naftiko.io/tier"] from each Capability CR +# to select the matching CapabilityClass and derive: +# - Pod resource requests/limits +# - HPA autoscaling parameters +# - Resilience4j defaults (circuit breaker, retry, bulkhead, rate limiter) +# +# SREs manage blast radius at the class level without touching individual +# capability specs. +# ============================================================================= +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: capabilityclasses.naftiko.io + annotations: + argocd.argoproj.io/sync-wave: "-1" +spec: + group: naftiko.io + names: + kind: CapabilityClass + listKind: CapabilityClassList + plural: capabilityclasses + singular: capabilityclass + shortNames: + - capclass + categories: + - naftiko + scope: Cluster # not namespaced: one set of classes for the whole cluster + versions: + - name: v1alpha3 # the only version defined; it is both served and stored + served: true + storage: true + additionalPrinterColumns: + - name: Memory Request + type: string + jsonPath: .spec.resources.requests.memory + - name: CPU Request + type: string + jsonPath: .spec.resources.requests.cpu + - name: Min Replicas + type: integer + jsonPath: .spec.hpa.minReplicas + - name: Max Replicas + type: integer + jsonPath: .spec.hpa.maxReplicas + - name: Age + type: date + jsonPath: .metadata.creationTimestamp + schema: + openAPIV3Schema: + type: object + description: > + A CapabilityClass defines resource and resilience defaults for a tier of Naftiko capabilities. Capabilities select their class via info.labels["naftiko.io/tier"]. The operator uses the class to set resource requests/limits, HPA parameters, and Resilience4j configuration on generated Deployments. 
+ properties: + spec: + type: object + description: Class specification + properties: + # ============================================================ + # Pod resource requests and limits + # ============================================================ + resources: + type: object + description: Resource requirements for the capability pod + properties: + requests: + type: object + properties: + memory: + type: string + description: "Memory request (e.g. 256Mi)" + cpu: + type: string + description: "CPU request (e.g. 250m)" + limits: + type: object + properties: + memory: + type: string + description: "Memory limit (e.g. 512Mi)" + cpu: + type: string + description: "CPU limit (e.g. 500m)" + + # ============================================================ + # Horizontal Pod Autoscaler parameters + # ============================================================ + hpa: + type: object + description: HPA configuration + properties: + minReplicas: + type: integer + minimum: 1 + description: Minimum number of replicas + maxReplicas: # NOTE(review): the schema does not enforce maxReplicas >= minReplicas + type: integer + minimum: 1 + description: Maximum number of replicas + targetRequestsPerSecond: + type: integer + description: Target requests/second triggering scale-up + + # ============================================================ + # Resilience4j defaults + # Injected as environment variables into the engine container. + # Each consumed namespace gets its own independent instance. + # ============================================================ + resilience: + type: object + description: > + Resilience4j defaults for all consumed namespaces. Per-namespace overrides via consumes[].tags (e.g. best-effort vs sla-999). 
+ properties: + circuitBreaker: + type: object + properties: + slidingWindowSize: + type: integer + description: Number of calls in the rolling window + failureRateThreshold: + type: integer + description: Failure rate percentage to open circuit + minimum: 1 + maximum: 100 + waitDurationInOpenState: + type: string + description: "Duration in open state before half-open (e.g. 30s)" + permittedCallsInHalfOpenState: + type: integer + description: Number of test calls in half-open state + retry: + type: object + properties: + maxAttempts: + type: integer + minimum: 1 + waitDuration: + type: string + description: "Wait between retries (e.g. 500ms)" + retryOnResultPredicate: + type: string + description: "Condition to retry (e.g. statusCode >= 500)" + bulkhead: + type: object + properties: + maxConcurrentCalls: + type: integer + description: Max concurrent calls per consumed namespace + maxWaitDuration: + type: string + description: "Max wait for a bulkhead slot (e.g. 100ms)" + rateLimit: + type: object + properties: + limitForPeriod: + type: integer + description: Max calls per refresh period + limitRefreshPeriod: + type: string + description: "Refresh period (e.g. 1s)" + timeoutDuration: + type: string + description: "Timeout when limit exceeded (0ms = fail-fast)" + + required: # only resources is mandatory; hpa and resilience may be omitted + - resources + required: + - spec diff --git a/skipper/crds/naftiko-capability-crd.yaml b/skipper/crds/naftiko-capability-crd.yaml new file mode 100644 index 0000000..bc90a6f --- /dev/null +++ b/skipper/crds/naftiko-capability-crd.yaml @@ -0,0 +1,308 @@ +# ============================================================================= +# Capability CRD — naftiko.io/v1alpha3/Capability +# ============================================================================= +# This CRD tells Kubernetes to accept a new resource type called "Capability". 
+# Once applied, SREs can do: kubectl get capabilities (or kubectl get cap) +# +# The .spec either: +# A) embeds the full ikanos spec YAML inline (ikanos + info + capability + binds) +# B) references a ConfigMap via specRef (recommended — decouples spec lifecycle) +# +# The Naftiko Skipper operator watches these CRs and reconciles them into +# running workloads (Deployment, Service, ConfigMap, Ingress, ServiceMonitor). +# ============================================================================= +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: capabilities.naftiko.io + annotations: + argocd.argoproj.io/sync-wave: "-1" +spec: + group: naftiko.io + names: + kind: Capability + listKind: CapabilityList + plural: capabilities + singular: capability + shortNames: + - cap + categories: + - naftiko + scope: Namespaced + versions: + - name: v1alpha3 + served: true + storage: true + subresources: + status: {} + additionalPrinterColumns: + - name: Label + type: string + description: Human-readable name of the capability + jsonPath: .spec.info.label + - name: Phase + type: string + description: Current phase of the capability + jsonPath: .status.phase + - name: Endpoint + type: string + description: Internal endpoint URL + jsonPath: .status.endpoint + - name: Age + type: date + jsonPath: .metadata.creationTimestamp + schema: + openAPIV3Schema: + type: object + description: > + A Capability represents a complete integration capability defined by the + ikanos specification. The Naftiko Skipper operator reconciles this resource + into a running workload with generated Deployment, Service, ConfigMap, + and optionally Ingress and ServiceMonitor resources. + properties: + spec: + type: object + description: The capability specification + x-kubernetes-preserve-unknown-fields: true + properties: + + # ── Spec reference (recommended) ──────────────────────────────── + # Points to a ConfigMap containing the full ikanos YAML. 
+ # Mutually exclusive with inline spec fields (ikanos, info, capability, binds). + specRef: + type: object + description: > + Reference to a ConfigMap containing the full ikanos YAML spec. + Recommended over inline spec — decouples the spec lifecycle from + the CR lifecycle and avoids large CR objects. + properties: + configMap: + type: string + description: Name of the ConfigMap in the same namespace + key: + type: string + description: Key inside the ConfigMap + default: capability.yaml + required: + - configMap + + # ── Inline spec fields ────────────────────────────────────────── + # Used when the full ikanos spec is embedded directly in the CR. + # Not required when specRef is set. + + ikanos: + type: string + description: > + Version of the ikanos specification in use (e.g. "1.0.0-alpha4"). + Required for inline specs; not required when specRef is used. + + info: + type: object + description: Capability metadata + properties: + label: + type: string + description: Display name of the capability + description: + type: string + description: Description of the capability + tags: + type: array + description: Tags for discovery and filtering + items: + type: string + labels: + type: object + description: > + Key-value labels propagated to all generated Kubernetes resources. + Used for cost allocation, label selectors, and CapabilityClass + selection via naftiko.io/tier. + additionalProperties: + type: string + created: + type: string + modified: + type: string + stakeholders: + type: array + items: + type: object + properties: + role: + type: string + fullName: + type: string + email: + type: string + required: + - role + - fullName + required: + - label + - description + + binds: + type: array + description: > + External secret sources for variable injection into the spec. + Each bind namespace resolves to a Kubernetes Secret named + {capability-name}-bind-{parent-dir}. 
+ items: + type: object + x-kubernetes-preserve-unknown-fields: true + + consumes: + type: array + description: Consumed API adapters declared at the top level (global scope) + items: + type: object + x-kubernetes-preserve-unknown-fields: true + + capability: + type: object + description: Capability technical configuration — exposes, consumes, aggregates + properties: + binds: + type: array + description: External secret sources (capability level) + items: + type: object + x-kubernetes-preserve-unknown-fields: true + exposes: + type: array + description: > + Exposed server adapters. Each entry gets its own named port on the + Service and Deployment. Supported types: rest, mcp, skill, control. + items: + type: object + x-kubernetes-preserve-unknown-fields: true + properties: + type: + type: string + description: > + Adapter type. Use "control" for the observability port + (/metrics, /health/live, /health/ready). + enum: + - rest + - mcp + - skill + - control + port: + type: integer + description: Listening port + namespace: + type: string + description: Unique identifier for this exposed adapter + lifecycle: + type: string + description: > + Lifecycle stage of this exposed adapter. + Maps to Backstage entity lifecycle. + enum: + - experimental + - production + - deprecated + tags: + type: array + description: > + Tags classifying this adapter. "public" triggers Ingress creation. + items: + type: string + authentication: + type: object + x-kubernetes-preserve-unknown-fields: true + consumes: + type: array + description: > + Consumed API adapters (capability level). + Entries with `import` + `from` are backed by a ConfigMap named + {capability-name}-import-{import-namespace}. + items: + type: object + x-kubernetes-preserve-unknown-fields: true + properties: + namespace: + type: string + type: + type: string + baseUri: + type: string + description: + type: string + import: + type: string + description: > + Import alias. When set, `from` must also be specified. 
+ The operator mounts the referenced file at /data/{from}. + from: + type: string + description: > + Relative path to the imported consumes file + (e.g. ./shared/registry-consumes.yml). + Used with `import`. Replaces the legacy `location` field. + tags: + type: array + items: + type: string + authentication: + type: object + x-kubernetes-preserve-unknown-fields: true + aggregates: + type: array + description: Domain aggregates defining reusable multi-step flows + items: + type: object + x-kubernetes-preserve-unknown-fields: true + + status: + type: object + description: Observed state of the capability — written by the operator, read-only + properties: + phase: + type: string + description: High-level summary of capability state + enum: + - Pending + - Running + - Degraded + - Failed + endpoint: + type: string + description: > + Internal cluster endpoint URL of the primary (first non-control) port + (e.g. http://name.namespace.svc.cluster.local:3001) + observedGeneration: + type: integer + format: int64 + description: Last .metadata.generation the operator successfully reconciled + conditions: + type: array + description: Standard Kubernetes conditions + items: + type: object + properties: + type: + type: string + description: > + Condition type. 
One of: Ready, SecretsSynced, Degraded, CircuitOpen + status: + type: string + enum: + - "True" + - "False" + - "Unknown" + reason: + type: string + description: Machine-readable reason for the condition + message: + type: string + description: Human-readable message + lastTransitionTime: + type: string + format: date-time + description: Last time the condition transitioned + required: + - type + - status + required: + - spec \ No newline at end of file diff --git a/skipper/defaults/capability-class-dev.yaml b/skipper/defaults/capability-class-dev.yaml new file mode 100644 index 0000000..bbcef37 --- /dev/null +++ b/skipper/defaults/capability-class-dev.yaml @@ -0,0 +1,35 @@ +apiVersion: naftiko.io/v1alpha3 +kind: CapabilityClass +metadata: + name: dev + labels: + app.kubernetes.io/part-of: naftiko +spec: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + hpa: + minReplicas: 1 + maxReplicas: 2 + targetRequestsPerSecond: 10 + resilience: + circuitBreaker: + failureRateThreshold: 80 + slidingWindowSize: 5 + waitDurationInOpenState: 60s + permittedCallsInHalfOpenState: 2 + retry: + maxAttempts: 2 + waitDuration: 1s + retryOnResultPredicate: "statusCode >= 500" + bulkhead: + maxConcurrentCalls: 5 + maxWaitDuration: 500ms + rateLimit: + limitForPeriod: 20 + limitRefreshPeriod: 1s + timeoutDuration: 0ms \ No newline at end of file diff --git a/skipper/defaults/capability-class-premium.yaml b/skipper/defaults/capability-class-premium.yaml new file mode 100644 index 0000000..4dcb00e --- /dev/null +++ b/skipper/defaults/capability-class-premium.yaml @@ -0,0 +1,35 @@ +apiVersion: naftiko.io/v1alpha3 +kind: CapabilityClass +metadata: + name: premium + labels: + app.kubernetes.io/part-of: naftiko +spec: + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + hpa: + minReplicas: 2 + maxReplicas: 20 + targetRequestsPerSecond: 1000 + resilience: + circuitBreaker: + failureRateThreshold: 30 + 
slidingWindowSize: 20 + waitDurationInOpenState: 15s + permittedCallsInHalfOpenState: 5 + retry: + maxAttempts: 5 + waitDuration: 200ms + retryOnResultPredicate: "statusCode >= 500" + bulkhead: + maxConcurrentCalls: 100 + maxWaitDuration: 50ms + rateLimit: + limitForPeriod: 1000 + limitRefreshPeriod: 1s + timeoutDuration: 0ms \ No newline at end of file diff --git a/skipper/defaults/capability-class-standard.yaml b/skipper/defaults/capability-class-standard.yaml new file mode 100644 index 0000000..01b374c --- /dev/null +++ b/skipper/defaults/capability-class-standard.yaml @@ -0,0 +1,35 @@ +apiVersion: naftiko.io/v1alpha3 +kind: CapabilityClass +metadata: + name: standard + labels: + app.kubernetes.io/part-of: naftiko +spec: + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + hpa: + minReplicas: 1 + maxReplicas: 4 + targetRequestsPerSecond: 100 + resilience: + circuitBreaker: + failureRateThreshold: 50 + slidingWindowSize: 10 + waitDurationInOpenState: 30s + permittedCallsInHalfOpenState: 3 + retry: + maxAttempts: 3 + waitDuration: 500ms + retryOnResultPredicate: "statusCode >= 500" + bulkhead: + maxConcurrentCalls: 20 + maxWaitDuration: 100ms + rateLimit: + limitForPeriod: 100 + limitRefreshPeriod: 1s + timeoutDuration: 0ms diff --git a/skipper/observability/dashboards/datadog-values.yaml b/skipper/observability/dashboards/datadog-values.yaml new file mode 100644 index 0000000..d25d184 --- /dev/null +++ b/skipper/observability/dashboards/datadog-values.yaml @@ -0,0 +1,30 @@ +datadog: + apiKeyExistingSecret: datadog-secret + site: datadoghq.eu + clusterName: localdev + kubelet: + tlsVerify: false + otlp: + receiver: + protocols: + http: + enabled: true + grpc: + enabled: true + apm: + portEnabled: true + systemProbe: + enabled: false + securityAgent: + runtime: + enabled: false + networkMonitoring: + enabled: false + processAgent: + enabled: false + env: + - name: DD_HOSTNAME + value: localdev-control-plane + +agents: + 
useHostNetwork: false diff --git a/skipper/observability/dashboards/ikanos-dashboard.json b/skipper/observability/dashboards/ikanos-dashboard.json new file mode 100644 index 0000000..cd48b95 --- /dev/null +++ b/skipper/observability/dashboards/ikanos-dashboard.json @@ -0,0 +1,486 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "mappings": [ + { "options": { "0": { "text": "DOWN" } }, "type": "value" }, + { "options": { "from": 1, "to": 999, "result": { "text": "UP" } }, "type": "range" } + ] + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "textMode": "value" + }, + "title": "Active Capabilities", + "type": "stat", + "targets": [ + { + "expr": "ikanos_capability_active", + "legendFormat": "{{ capability_name }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 5, "x": 4, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + }, + "title": "Request Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(ikanos_request_total[$__rate_interval]))", + "legendFormat": "req/s" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "thresholds": { + "mode": "absolute", + "steps": 
[ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.1 }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 5, "x": 9, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + }, + "title": "Error Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(ikanos_request_errors[$__rate_interval]))", + "legendFormat": "err/s" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "red", "value": 2 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 5, "x": 14, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + }, + "title": "P99 Latency", + "type": "stat", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(ikanos_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 5, "x": 19, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + }, + "title": "Error Ratio", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(ikanos_request_errors[$__rate_interval])) / clamp_min(sum(rate(ikanos_request_total[$__rate_interval])), 1)", + "legendFormat": "error %" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "Request RED Metrics", + 
"type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "pointSize": 5, + "stacking": { "mode": "none" } + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 10, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "Request Rate by Adapter", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(ikanos_request_total[$__rate_interval])) by (ikanos_adapter_type)", + "legendFormat": "{{ ikanos_adapter_type }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 11, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "Error Rate by Adapter", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(ikanos_request_errors[$__rate_interval])) by (ikanos_adapter_type)", + "legendFormat": "{{ ikanos_adapter_type }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "id": 12, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "Request Duration (p50 / p95 / p99)", + "type": "timeseries", + "targets": [ + { + 
"expr": "histogram_quantile(0.50, sum(rate(ikanos_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(ikanos_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(ikanos_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 13, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "Request Duration by Operation (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(ikanos_request_duration_seconds_bucket[$__rate_interval])) by (le, ikanos_operation_id))", + "legendFormat": "{{ ikanos_operation_id }}" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 102, + "title": "Step Execution", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 }, + "id": 20, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "Step Duration by Type (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(ikanos_step_duration_seconds_bucket[$__rate_interval])) 
by (le, step_type))", + "legendFormat": "{{ step_type }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 23 }, + "id": 21, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "Step Duration by Namespace (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(ikanos_step_duration_seconds_bucket[$__rate_interval])) by (le, ikanos_namespace))", + "legendFormat": "{{ ikanos_namespace }}" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "id": 103, + "title": "HTTP Client (Consumed APIs)", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, + "id": 30, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "HTTP Client Request Rate by Host", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(ikanos_http_client_total[$__rate_interval])) by (server_address)", + "legendFormat": "{{ server_address }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "stacking": { "mode": "none" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "id": 
31, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "HTTP Client Duration by Host (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(ikanos_http_client_duration_seconds_bucket[$__rate_interval])) by (le, server_address))", + "legendFormat": "{{ server_address }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "stacking": { "mode": "normal" } + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 40 }, + "id": 32, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi" } + }, + "title": "HTTP Client Requests by Status Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(ikanos_http_client_total[$__rate_interval])) by (http_response_status_code)", + "legendFormat": "{{ http_response_status_code }}" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["naftiko", "observability", "RED"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "type": "datasource" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Naftiko — RED Metrics", + "uid": "naftiko-red-metrics", + "version": 1 +} \ No newline at end of file diff --git a/skipper/observability/dashboards/naftiko-datadog-dashboard.json b/skipper/observability/dashboards/naftiko-datadog-dashboard.json new file mode 100644 index 0000000..a401014 --- /dev/null +++ b/skipper/observability/dashboards/naftiko-datadog-dashboard.json @@ -0,0 +1,16 @@ +{ + "title": 
"Naftiko — RED Metrics", + "description": "Naftiko engine observability — Rate, Errors, Duration", + "layout_type": "free", + "widgets": [ + {"definition":{"title":"Active Capabilities","type":"query_value","requests":[{"q":"avg:ikanos_capability_active{*}","aggregator":"last"}],"precision":0},"layout":{"x":0,"y":0,"width":2,"height":2}}, + {"definition":{"title":"Request Rate","type":"query_value","requests":[{"q":"sum:ikanos_request_total{*}.as_rate()","aggregator":"avg"}],"autoscale":true},"layout":{"x":2,"y":0,"width":2,"height":2}}, + {"definition":{"title":"Error Rate","type":"query_value","requests":[{"q":"sum:ikanos_request_errors{*}.as_rate()","aggregator":"avg"}],"autoscale":true},"layout":{"x":4,"y":0,"width":2,"height":2}}, + {"definition":{"title":"Request Rate by Adapter","type":"timeseries","requests":[{"q":"sum:ikanos_request_total{*} by {ikanos_adapter_type}.as_rate()","display_type":"line"}]},"layout":{"x":0,"y":2,"width":6,"height":3}}, + {"definition":{"title":"Request Duration (p50/p95/p99)","type":"timeseries","requests":[{"q":"p50:ikanos_request_duration_seconds{*}","display_type":"line"},{"q":"p95:ikanos_request_duration_seconds{*}","display_type":"line"},{"q":"p99:ikanos_request_duration_seconds{*}","display_type":"line"}]},"layout":{"x":6,"y":2,"width":6,"height":3}}, + {"definition":{"title":"HTTP Client Rate by Host","type":"timeseries","requests":[{"q":"sum:ikanos_http_client_total{*} by {server_address}.as_rate()","display_type":"bars"}]},"layout":{"x":0,"y":5,"width":6,"height":3}}, + {"definition":{"title":"HTTP Client Duration p95","type":"timeseries","requests":[{"q":"p95:ikanos_http_client_duration_seconds{*} by {server_address}","display_type":"line"}]},"layout":{"x":6,"y":5,"width":6,"height":3}} + ], + "template_variables":[], + "notify_list":[] +}