diff --git a/README.md b/README.md index a81b5a3..07cf7b2 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A Kubernetes controller that automatically creates and manages kuberik HealthChe ## Overview -This controller watches for DatadogMonitor resources and automatically creates corresponding kuberik HealthCheck resources when a specific annotation is present. The HealthCheck status is then continuously updated based on the DatadogMonitor's health state. +This controller watches for DatadogMonitor resources and automatically creates corresponding kuberik HealthCheck resources when a specific annotation is present. When a monitor provides Datadog deployment-gate inputs, the HealthCheck status is driven by the Datadog Deployment Gates API; otherwise it falls back to the DatadogMonitor status. ## How It Works @@ -28,11 +28,17 @@ When the annotation is present: ### 3. Status Synchronization -The controller continuously monitors the DatadogMonitor status and updates the HealthCheck accordingly: +When the monitor has both `service:` and `env:` tags (or equivalent override annotations), the controller evaluates a Datadog deployment gate and maps the gate result to the HealthCheck: -- **OK** → `Healthy` -- **Alert** → `Unhealthy` (with error timestamp) -- **Warn/NoData/Skipped/Ignored** → `Pending` +- **pass** -> `Healthy` +- **fail** -> `Unhealthy` (with error timestamp) +- **in_progress** -> `Pending` + +If deployment-gate inputs are not configured, the controller falls back to the DatadogMonitor status: + +- **OK** -> `Healthy` +- **Alert** -> `Unhealthy` (with error timestamp) +- **Warn/NoData/Skipped/Ignored** -> `Pending` ### 4. Cleanup @@ -50,11 +56,18 @@ metadata: namespace: production annotations: kuberik.com/health-check: "true" + # Optional deployment-gate overrides: + # kuberik.com/datadog-gate-identifier: "canary" + # kuberik.com/datadog-gate-version: "1.2.3" + # kuberik.com/datadog-gate-apm-primary-tag: "team:platform" spec: name: "High CPU Alert" type: "metric alert" query: "avg(last_5m):avg:system.cpu.user{*} > 80" message: "CPU usage is high" + tags: + - "env:production" + - "service:payments" ``` This will automatically create a HealthCheck named `datadog-check-my-monitor` in the `production` namespace. @@ -83,6 +96,18 @@ The system consists of two main controllers: The controller uses the following annotation key: - `kuberik.com/health-check`: Set to `"true"` to enable HealthCheck creation +Optional deployment-gate annotations on the DatadogMonitor: +- `kuberik.com/datadog-gate-service`: Override the Datadog service instead of reading the `service:` tag +- `kuberik.com/datadog-gate-env`: Override the Datadog environment instead of reading the `env:` tag +- `kuberik.com/datadog-gate-identifier`: Override the gate identifier (defaults to `default`) +- `kuberik.com/datadog-gate-version`: Version forwarded to the Datadog gate evaluation request +- `kuberik.com/datadog-gate-apm-primary-tag`: APM primary tag forwarded to the Datadog gate evaluation request + +To enable deployment-gate evaluations, set Datadog credentials in the controller environment: +- `DATADOG_API_KEY` or `DD_API_KEY` +- `DATADOG_APP_KEY` or `DD_APP_KEY` +- `DATADOG_SITE` or `DD_SITE` (optional, defaults to `datadoghq.com`) + ## Monitoring The controller logs all operations including: diff --git a/config/samples/datadogmonitor-sample.yaml b/config/samples/datadogmonitor-sample.yaml index 351b8bb..157ab98 100644 --- a/config/samples/datadogmonitor-sample.yaml +++ b/config/samples/datadogmonitor-sample.yaml @@ -5,6 +5,9 @@ metadata: namespace: default annotations: kuberik.com/health-check: "true" + # kuberik.com/datadog-gate-identifier: "canary" + # kuberik.com/datadog-gate-version: "1.2.3" + # kuberik.com/datadog-gate-apm-primary-tag: "team:platform" spec: name: "Sample Monitor" type: "metric alert" diff --git a/go.mod b/go.mod index e343b73..e867a23 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/kuberik/datadog-controller go 1.24.2 require ( + github.com/DataDog/datadog-api-client-go/v2 v2.56.0 github.com/DataDog/datadog-operator v1.8.0 github.com/kuberik/rollout-controller v0.2.1 github.com/onsi/ginkgo/v2 v2.22.0 @@ -15,7 +16,6 @@ require ( require ( cel.dev/expr v0.19.1 // indirect - github.com/DataDog/datadog-api-client-go/v2 v2.19.0 // indirect github.com/DataDog/extendeddaemonset v0.10.0-rc.4 // indirect github.com/DataDog/zstd v1.5.2 // indirect github.com/Masterminds/semver/v3 v3.1.1 // indirect diff --git a/go.sum b/go.sum index 7be376b..4ce7a20 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -github.com/DataDog/datadog-api-client-go/v2 v2.19.0 h1:Wvz/63/q39EpVwSH1T8jVyRvPcMfEABenU7sD3dO2Lc= -github.com/DataDog/datadog-api-client-go/v2 v2.19.0/go.mod h1:oD5Lx8Li3oPRa/BSBenkn4i48z+91gwYORF/+6ph71g= +github.com/DataDog/datadog-api-client-go/v2 v2.56.0 h1:HKcfvAODmJCUw7nfbDKKqkEUgcu7CfxUPA9EFRJrHEI= +github.com/DataDog/datadog-api-client-go/v2 v2.56.0/go.mod h1:d3tOEgUd2kfsr9uuHQdY+nXrWp4uikgTgVCPdKNK30U= github.com/DataDog/datadog-operator v1.8.0 h1:5gzza6p+kwxkO0kYfKpN6c8l96xlP6bk7K5pxDEPRZg= github.com/DataDog/datadog-operator v1.8.0/go.mod h1:IaTKfjDrsmc7pcBCaKnlhS/I68GuiFIpAoM0+fh3QfQ= github.com/DataDog/extendeddaemonset v0.10.0-rc.4 h1:m88E+emuRHIqKgi7kHMd9N0S/NtruCCOISp3cjB7DNs= diff --git a/internal/controller/datadog_gates_client.go b/internal/controller/datadog_gates_client.go new file mode 100644 index 0000000..2e973f7 --- /dev/null +++ b/internal/controller/datadog_gates_client.go @@ -0,0 +1,263 @@ +package controller + +import ( + "context" + "errors" + "fmt" + "net/http" + "net/url" + "os" + "strings" + "time" + + "github.com/DataDog/datadog-api-client-go/v2/api/datadog" +) + +const ( + datadogAPIKeyEnvVar = "DATADOG_API_KEY" + datadogAltAPIKeyEnvVar = "DD_API_KEY" + datadogAppKeyEnvVar = "DATADOG_APP_KEY" + datadogAltAppKeyEnvVar = "DD_APP_KEY" + datadogSiteEnvVar = "DATADOG_SITE" + datadogAltSiteEnvVar = "DD_SITE" + defaultDatadogSite = "datadoghq.com" + defaultDatadogHTTPTimout = 10 * time.Second +) + +var errDatadogDeploymentGatesNotConfigured = errors.New("datadog deployment gates client is not configured") + +type deploymentGateClient interface { + StartEvaluation(ctx context.Context, input deploymentGateInput) (string, error) + GetEvaluation(ctx context.Context, evaluationID string) (deploymentGateEvaluation, error) +} + +type deploymentGateInput struct { + Service string + Env string + Identifier string + Version string + APMPrimaryTag string +} + +type deploymentGateEvaluation struct { + ID string + URL string + Status string + Rules []deploymentGateRuleEvaluation +} + +type deploymentGateRuleEvaluation struct { + Name string + Status string + Reason string + DryRun bool +} + +type datadogDeploymentGatesClient struct { + apiClient *datadog.APIClient + apiKeys map[string]datadog.APIKey + serverVariables map[string]string +} + +func newDatadogDeploymentGatesClientFromEnv() (deploymentGateClient, error) { + apiKey := firstNonEmpty(os.Getenv(datadogAPIKeyEnvVar), os.Getenv(datadogAltAPIKeyEnvVar)) + if apiKey == "" { + return nil, fmt.Errorf("%w: missing %s or %s", errDatadogDeploymentGatesNotConfigured, datadogAPIKeyEnvVar, datadogAltAPIKeyEnvVar) + } + + appKey := firstNonEmpty(os.Getenv(datadogAppKeyEnvVar), os.Getenv(datadogAltAppKeyEnvVar)) + if appKey == "" { + return nil, fmt.Errorf("%w: missing %s or %s", errDatadogDeploymentGatesNotConfigured, datadogAppKeyEnvVar, datadogAltAppKeyEnvVar) + } + + site := firstNonEmpty(os.Getenv(datadogSiteEnvVar), os.Getenv(datadogAltSiteEnvVar), defaultDatadogSite) + return newDatadogDeploymentGatesClient(site, apiKey, appKey, nil), nil +} + +func newDatadogDeploymentGatesClient(baseURL string, apiKey string, appKey string, httpClient *http.Client) deploymentGateClient { + if httpClient == nil { + httpClient = &http.Client{Timeout: defaultDatadogHTTPTimout} + } + + cfg := datadog.NewConfiguration() + cfg.HTTPClient = httpClient + + client := &datadogDeploymentGatesClient{ + apiClient: datadog.NewAPIClient(cfg), + apiKeys: map[string]datadog.APIKey{ + "apiKeyAuth": {Key: apiKey}, + "appKeyAuth": {Key: appKey}, + }, + } + + trimmed := strings.TrimSpace(baseURL) + if strings.HasPrefix(trimmed, "https://") || strings.HasPrefix(trimmed, "http://") { + cfg.Servers = datadog.ServerConfigurations{ + { + URL: strings.TrimRight(trimmed, "/"), + Description: "custom Datadog deployment gates server", + }, + } + return client + } + + client.serverVariables = map[string]string{ + "site": normalizeDatadogSite(trimmed), + } + + return client +} + +func (c *datadogDeploymentGatesClient) StartEvaluation(ctx context.Context, input deploymentGateInput) (string, error) { + payload := map[string]any{ + "data": map[string]any{ + "type": "deployment_gates_evaluation_request", + "attributes": map[string]any{ + "service": input.Service, + "env": input.Env, + "identifier": input.Identifier, + }, + }, + } + + attributes := payload["data"].(map[string]any)["attributes"].(map[string]any) + if input.Version != "" { + attributes["version"] = input.Version + } + if input.APMPrimaryTag != "" { + attributes["apm_primary_tag"] = input.APMPrimaryTag + } + + var response struct { + Data struct { + Attributes struct { + EvaluationID string `json:"evaluation_id"` + } `json:"attributes"` + } `json:"data"` + } + + if err := c.do(ctx, http.MethodPost, "/api/unstable/deployments/gates/evaluation", payload, &response); err != nil { + return "", err + } + + if response.Data.Attributes.EvaluationID == "" { + return "", errors.New("deployment gate evaluation response did not include an evaluation_id") + } + + return response.Data.Attributes.EvaluationID, nil +} + +func (c *datadogDeploymentGatesClient) GetEvaluation(ctx context.Context, evaluationID string) (deploymentGateEvaluation, error) { + var response struct { + Data struct { + ID string `json:"id"` + Attributes struct { + EvaluationURL string `json:"evaluation_url"` + GateStatus string `json:"gate_status"` + Rules []deploymentGateRuleEvaluation `json:"rules"` + } `json:"attributes"` + } `json:"data"` + } + + if err := c.do(ctx, http.MethodGet, "/api/unstable/deployments/gates/evaluation/"+url.PathEscape(evaluationID), nil, &response); err != nil { + return deploymentGateEvaluation{}, err + } + + return deploymentGateEvaluation{ + ID: response.Data.ID, + URL: response.Data.Attributes.EvaluationURL, + Status: response.Data.Attributes.GateStatus, + Rules: response.Data.Attributes.Rules, + }, nil +} + +func (c *datadogDeploymentGatesClient) do(ctx context.Context, method string, path string, body any, out any) error { + requestContext := c.requestContext(ctx) + baseURL, err := c.apiClient.Cfg.ServerURLWithContext(requestContext, "deploymentGatesEvaluation") + if err != nil { + return fmt.Errorf("resolve datadog deployment gates server URL: %w", err) + } + + headerParams := map[string]string{ + "Accept": "application/json", + } + if body != nil { + headerParams["Content-Type"] = "application/json" + } + + datadog.SetAuthKeys( + requestContext, + &headerParams, + [2]string{"apiKeyAuth", "DD-API-KEY"}, + [2]string{"appKeyAuth", "DD-APPLICATION-KEY"}, + ) + + req, err := c.apiClient.PrepareRequest( + requestContext, + baseURL+path, + method, + body, + headerParams, + url.Values{}, + url.Values{}, + nil, + ) + if err != nil { + return fmt.Errorf("prepare datadog deployment gates request: %w", err) + } + + resp, err := c.apiClient.CallAPI(req) + if err != nil { + return fmt.Errorf("call datadog deployment gates API: %w", err) + } + if resp == nil { + return errors.New("datadog deployment gates API returned a nil response") + } + + responseBody, err := datadog.ReadBody(resp) + if err != nil { + return fmt.Errorf("read datadog deployment gates API response: %w", err) + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("datadog deployment gates API returned %s: %s", resp.Status, strings.TrimSpace(string(responseBody))) + } + + if err := c.apiClient.Decode(out, responseBody, resp.Header.Get("Content-Type")); err != nil { + return fmt.Errorf("decode datadog deployment gates API response: %w", err) + } + + return nil +} + +func (c *datadogDeploymentGatesClient) requestContext(ctx context.Context) context.Context { + if ctx == nil { + ctx = context.Background() + } + + ctx = context.WithValue(ctx, datadog.ContextAPIKeys, c.apiKeys) + if len(c.serverVariables) > 0 { + ctx = context.WithValue(ctx, datadog.ContextServerVariables, c.serverVariables) + } + + return ctx +} + +func normalizeDatadogSite(site string) string { + trimmed := strings.TrimSpace(site) + if trimmed == "" { + return defaultDatadogSite + } + + return strings.TrimPrefix(trimmed, "api.") +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if strings.TrimSpace(value) != "" { + return value + } + } + + return "" +} diff --git a/internal/controller/datadog_gates_client_test.go b/internal/controller/datadog_gates_client_test.go new file mode 100644 index 0000000..ea7fc93 --- /dev/null +++ b/internal/controller/datadog_gates_client_test.go @@ -0,0 +1,142 @@ +package controller + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" +) + +func TestDatadogDeploymentGatesClientStartEvaluation(t *testing.T) { + t.Helper() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/unstable/deployments/gates/evaluation" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + + if r.Method != http.MethodPost { + t.Fatalf("unexpected method: %s", r.Method) + } + + if got := r.Header.Get("DD-API-KEY"); got != "api-key" { + t.Fatalf("unexpected DD-API-KEY header: %s", got) + } + + if got := r.Header.Get("DD-APPLICATION-KEY"); got != "app-key" { + t.Fatalf("unexpected DD-APPLICATION-KEY header: %s", got) + } + + var payload map[string]any + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + t.Fatalf("decode payload: %v", err) + } + + data := payload["data"].(map[string]any) + attributes := data["attributes"].(map[string]any) + if got := attributes["service"]; got != "payments" { + t.Fatalf("unexpected service: %#v", got) + } + if got := attributes["env"]; got != "prod" { + t.Fatalf("unexpected env: %#v", got) + } + if got := attributes["identifier"]; got != "canary" { + t.Fatalf("unexpected identifier: %#v", got) + } + if got := attributes["version"]; got != "1.2.3" { + t.Fatalf("unexpected version: %#v", got) + } + if got := attributes["apm_primary_tag"]; got != "team:platform" { + t.Fatalf("unexpected apm_primary_tag: %#v", got) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"data":{"attributes":{"evaluation_id":"eval-123"}}}`)) + })) + defer server.Close() + + client := newDatadogDeploymentGatesClient(server.URL, "api-key", "app-key", server.Client()) + evaluationID, err := client.StartEvaluation(context.Background(), deploymentGateInput{ + Service: "payments", + Env: "prod", + Identifier: "canary", + Version: "1.2.3", + APMPrimaryTag: "team:platform", + }) + if err != nil { + t.Fatalf("StartEvaluation returned error: %v", err) + } + + if evaluationID != "eval-123" { + t.Fatalf("unexpected evaluation ID: %s", evaluationID) + } +} + +func TestDatadogDeploymentGatesClientGetEvaluation(t *testing.T) { + t.Helper() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/unstable/deployments/gates/evaluation/eval-123" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + + if r.Method != http.MethodGet { + t.Fatalf("unexpected method: %s", r.Method) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "data": { + "id": "eval-123", + "attributes": { + "evaluation_url": "https://app.datadoghq.com/ci/deployment-gates/eval-123", + "gate_status": "pass", + "rules": [ + { + "name": "latency", + "status": "pass", + "reason": "within threshold", + "dry_run": false + } + ] + } + } + }`)) + })) + defer server.Close() + + client := newDatadogDeploymentGatesClient(server.URL, "api-key", "app-key", server.Client()) + evaluation, err := client.GetEvaluation(context.Background(), "eval-123") + if err != nil { + t.Fatalf("GetEvaluation returned error: %v", err) + } + + if evaluation.Status != "pass" { + t.Fatalf("unexpected gate status: %s", evaluation.Status) + } + + if evaluation.URL != "https://app.datadoghq.com/ci/deployment-gates/eval-123" { + t.Fatalf("unexpected evaluation URL: %s", evaluation.URL) + } + + if len(evaluation.Rules) != 1 || evaluation.Rules[0].Name != "latency" { + t.Fatalf("unexpected rules: %#v", evaluation.Rules) + } +} + +func TestNormalizeDatadogSite(t *testing.T) { + t.Helper() + + testCases := map[string]string{ + "": "datadoghq.com", + "datadoghq.eu": "datadoghq.eu", + "api.us3.datadoghq.com": "us3.datadoghq.com", + } + + for input, expected := range testCases { + if got := normalizeDatadogSite(input); got != expected { + t.Fatalf("normalizeDatadogSite(%q) = %q, want %q", input, got, expected) + } + } +} diff --git a/internal/controller/healthcheck_controller.go b/internal/controller/healthcheck_controller.go index 0ae707e..236f9e0 100644 --- a/internal/controller/healthcheck_controller.go +++ b/internal/controller/healthcheck_controller.go @@ -18,7 +18,10 @@ package controller import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" + "strings" "time" "k8s.io/apimachinery/pkg/api/errors" @@ -35,10 +38,27 @@ import ( kuberikrolloutv1alpha1 "github.com/kuberik/rollout-controller/api/v1alpha1" ) +const ( + managedByLabelKey = "kuberik.com/managed-by" + managedByDatadogController = "datadog-controller" + datadogMonitorNameAnnotationKey = "kuberik.com/datadog-monitor-name" + datadogGateServiceAnnotationKey = "kuberik.com/datadog-gate-service" + datadogGateEnvAnnotationKey = "kuberik.com/datadog-gate-env" + datadogGateIdentifierAnnotationKey = "kuberik.com/datadog-gate-identifier" + datadogGateVersionAnnotationKey = "kuberik.com/datadog-gate-version" + datadogGateAPMPrimaryTagAnnotationKey = "kuberik.com/datadog-gate-apm-primary-tag" + datadogGateEvaluationIDAnnotationKey = "kuberik.com/datadog-gate-evaluation-id" + datadogGateInputHashAnnotationKey = "kuberik.com/datadog-gate-input-hash" + defaultDatadogGateIdentifier = "default" + healthCheckRequeueInterval = 30 * time.Second + deploymentGatePollInterval = 15 * time.Second +) + // HealthCheckReconciler reconciles a HealthCheck object type HealthCheckReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + GateClient deploymentGateClient } // +kubebuilder:rbac:groups=kuberik.com,resources=healthchecks,verbs=get;list;watch;update;patch @@ -84,17 +104,20 @@ func (r *HealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Request) log.Error(err, "Failed to update HealthCheck status") return ctrl.Result{}, err } - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + return ctrl.Result{RequeueAfter: healthCheckRequeueInterval}, nil + } + + if gateInput, ok := r.resolveDeploymentGateInput(datadogMonitor); ok { + return r.reconcileDeploymentGate(ctx, healthCheck, gateInput) } - // Update HealthCheck status based on DatadogMonitor status + // Fall back to DatadogMonitor status when deployment gate inputs are not configured. if err := r.updateHealthCheckStatusFromMonitor(ctx, healthCheck, datadogMonitor); err != nil { log.Error(err, "Failed to update HealthCheck status from monitor") return ctrl.Result{}, err } - // Requeue every 30 seconds to keep status updated - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + return ctrl.Result{RequeueAfter: healthCheckRequeueInterval}, nil } // isManagedByDatadogController checks if the HealthCheck is managed by our controller @@ -103,8 +126,8 @@ func (r *HealthCheckReconciler) isManagedByDatadogController(healthCheck *kuberi return false } - value, exists := healthCheck.Labels["kuberik.com/managed-by"] - return exists && value == "datadog-controller" + value, exists := healthCheck.Labels[managedByLabelKey] + return exists && value == managedByDatadogController } // findOwnerDatadogMonitor finds the DatadogMonitor that owns this HealthCheck @@ -130,7 +153,7 @@ func (r *HealthCheckReconciler) findOwnerDatadogMonitor(ctx context.Context, hea // Fallback: try to find by annotation if healthCheck.Annotations != nil { - if monitorName, exists := healthCheck.Annotations["kuberik.com/datadog-monitor-name"]; exists { + if monitorName, exists := healthCheck.Annotations[datadogMonitorNameAnnotationKey]; exists { datadogMonitor := &datadoghqcomv1alpha1.DatadogMonitor{} err := r.Get(ctx, types.NamespacedName{ Namespace: healthCheck.Namespace, @@ -203,6 +226,129 @@ func (r *HealthCheckReconciler) updateHealthCheckStatusFromMonitor(ctx context.C return r.updateHealthCheckStatus(ctx, healthCheck, healthStatus, lastErrorTime) } +func (r *HealthCheckReconciler) resolveDeploymentGateInput(monitor *datadoghqcomv1alpha1.DatadogMonitor) (deploymentGateInput, bool) { + if monitor == nil { + return deploymentGateInput{}, false + } + + service := firstNonEmpty( + annotationValue(monitor.Annotations, datadogGateServiceAnnotationKey), + tagValue(monitor.Spec.Tags, "service"), + ) + env := firstNonEmpty( + annotationValue(monitor.Annotations, datadogGateEnvAnnotationKey), + tagValue(monitor.Spec.Tags, "env"), + ) + + if service == "" || env == "" { + return deploymentGateInput{}, false + } + + return deploymentGateInput{ + Service: service, + Env: env, + Identifier: firstNonEmpty(annotationValue(monitor.Annotations, datadogGateIdentifierAnnotationKey), defaultDatadogGateIdentifier), + Version: annotationValue(monitor.Annotations, datadogGateVersionAnnotationKey), + APMPrimaryTag: annotationValue(monitor.Annotations, datadogGateAPMPrimaryTagAnnotationKey), + }, true +} + +func (r *HealthCheckReconciler) reconcileDeploymentGate( + ctx context.Context, + healthCheck *kuberikrolloutv1alpha1.HealthCheck, + input deploymentGateInput, +) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + gateClient, err := r.getDeploymentGateClient() + if err != nil { + logger.Info("Datadog deployment gates client is not configured, marking HealthCheck as Pending", "error", err) + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusPending, healthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: healthCheckRequeueInterval}, nil + } + + inputHash := hashDeploymentGateInput(input) + currentHash := annotationValue(healthCheck.Annotations, datadogGateInputHashAnnotationKey) + evaluationID := annotationValue(healthCheck.Annotations, datadogGateEvaluationIDAnnotationKey) + + if evaluationID == "" || currentHash != inputHash { + evaluationID, err = gateClient.StartEvaluation(ctx, input) + if err != nil { + logger.Error(err, "Failed to start Datadog deployment gate evaluation", "service", input.Service, "env", input.Env, "identifier", input.Identifier) + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusPending, healthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: healthCheckRequeueInterval}, nil + } + + logger.Info("Started Datadog deployment gate evaluation", + "healthCheck", healthCheck.Name, + "service", input.Service, + "env", input.Env, + "identifier", input.Identifier, + "evaluationID", evaluationID) + + if err := r.patchHealthCheckAnnotations(ctx, healthCheck, map[string]string{ + datadogGateEvaluationIDAnnotationKey: evaluationID, + datadogGateInputHashAnnotationKey: inputHash, + }); err != nil { + return ctrl.Result{}, err + } + + refreshedHealthCheck := &kuberikrolloutv1alpha1.HealthCheck{} + if err := r.Get(ctx, client.ObjectKeyFromObject(healthCheck), refreshedHealthCheck); err != nil { + return ctrl.Result{}, err + } + + if err := r.updateHealthCheckStatus(ctx, refreshedHealthCheck, kuberikrolloutv1alpha1.HealthStatusPending, refreshedHealthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: deploymentGatePollInterval}, nil + } + + evaluation, err := gateClient.GetEvaluation(ctx, evaluationID) + if err != nil { + logger.Error(err, "Failed to fetch Datadog deployment gate evaluation", "evaluationID", evaluationID, "healthCheck", healthCheck.Name) + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusPending, healthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: healthCheckRequeueInterval}, nil + } + + logger.Info("Fetched Datadog deployment gate evaluation", + "healthCheck", healthCheck.Name, + "evaluationID", evaluationID, + "gateStatus", evaluation.Status, + "ruleCount", len(evaluation.Rules)) + + switch evaluation.Status { + case "pass": + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusHealthy, healthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + case "fail": + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusUnhealthy, lastErrorTimeForUnhealthy(healthCheck)); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + case "in_progress": + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusPending, healthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: deploymentGatePollInterval}, nil + default: + logger.Info("Unknown Datadog deployment gate status, marking HealthCheck as Pending", "evaluationID", evaluationID, "gateStatus", evaluation.Status) + if err := r.updateHealthCheckStatus(ctx, healthCheck, kuberikrolloutv1alpha1.HealthStatusPending, healthCheck.Status.LastErrorTime); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: healthCheckRequeueInterval}, nil + } +} + // updateHealthCheckStatus updates the HealthCheck status func (r *HealthCheckReconciler) updateHealthCheckStatus(ctx context.Context, healthCheck *kuberikrolloutv1alpha1.HealthCheck, status kuberikrolloutv1alpha1.HealthStatus, lastErrorTime *metav1.Time) error { // Only update if status has changed @@ -219,6 +365,79 @@ func (r *HealthCheckReconciler) updateHealthCheckStatus(ctx context.Context, hea return r.Status().Update(ctx, healthCheck) } +func (r *HealthCheckReconciler) patchHealthCheckAnnotations(ctx context.Context, healthCheck *kuberikrolloutv1alpha1.HealthCheck, annotations map[string]string) error { + original := healthCheck.DeepCopy() + if healthCheck.Annotations == nil { + healthCheck.Annotations = make(map[string]string, len(annotations)) + } + + changed := false + for key, value := range annotations { + if healthCheck.Annotations[key] != value { + healthCheck.Annotations[key] = value + changed = true + } + } + + if !changed { + return nil + } + + return r.Patch(ctx, healthCheck, client.MergeFrom(original)) +} + +func (r *HealthCheckReconciler) getDeploymentGateClient() (deploymentGateClient, error) { + if r.GateClient != nil { + return r.GateClient, nil + } + + return newDatadogDeploymentGatesClientFromEnv() +} + +func lastErrorTimeForUnhealthy(healthCheck *kuberikrolloutv1alpha1.HealthCheck) *metav1.Time { + if healthCheck.Status.Status == kuberikrolloutv1alpha1.HealthStatusUnhealthy && healthCheck.Status.LastErrorTime != nil { + return healthCheck.Status.LastErrorTime + } + + now := metav1.Now() + return &now +} + +func hashDeploymentGateInput(input deploymentGateInput) string { + hash := sha256.Sum256([]byte(strings.Join([]string{ + input.Service, + input.Env, + input.Identifier, + input.Version, + input.APMPrimaryTag, + }, "\x00"))) + + return hex.EncodeToString(hash[:]) +} + +func annotationValue(annotations map[string]string, key string) string { + if annotations == nil { + return "" + } + + return strings.TrimSpace(annotations[key]) +} + +func tagValue(tags []string, key string) string { + for _, tag := range tags { + tagKey, tagValue, ok := strings.Cut(tag, ":") + if !ok { + continue + } + + if strings.EqualFold(strings.TrimSpace(tagKey), key) { + return strings.TrimSpace(tagValue) + } + } + + return "" +} + // enqueueHealthChecksForMonitor enqueues HealthCheck objects that are owned by a given DatadogMonitor func (r *HealthCheckReconciler) enqueueHealthChecksForMonitor(ctx context.Context, obj client.Object) []reconcile.Request { datadogMonitor, ok := obj.(*datadoghqcomv1alpha1.DatadogMonitor) diff --git a/internal/controller/healthcheck_controller_test.go b/internal/controller/healthcheck_controller_test.go index c70ff6e..c2655bc 100644 --- a/internal/controller/healthcheck_controller_test.go +++ b/internal/controller/healthcheck_controller_test.go @@ -32,6 +32,31 @@ import ( kuberikrolloutv1alpha1 "github.com/kuberik/rollout-controller/api/v1alpha1" ) +type fakeDeploymentGateClient struct { + startFunc func(ctx context.Context, input deploymentGateInput) (string, error) + getFunc func(ctx context.Context, evaluationID string) (deploymentGateEvaluation, error) + startedInputs []deploymentGateInput + requestedIDs []string +} + +func (f *fakeDeploymentGateClient) StartEvaluation(ctx context.Context, input deploymentGateInput) (string, error) { + f.startedInputs = append(f.startedInputs, input) + if f.startFunc != nil { + return f.startFunc(ctx, input) + } + + return "", nil +} + +func (f *fakeDeploymentGateClient) GetEvaluation(ctx context.Context, evaluationID string) (deploymentGateEvaluation, error) { + f.requestedIDs = append(f.requestedIDs, evaluationID) + if f.getFunc != nil { + return f.getFunc(ctx, evaluationID) + } + + return deploymentGateEvaluation{}, nil +} + var _ = Describe("HealthCheck Controller", func() { Context("When reconciling a HealthCheck", func() { const resourceName = "test-healthcheck" @@ -361,6 +386,113 @@ var _ = Describe("HealthCheck Controller", func() { Expect(updatedHealthCheck.Status.LastErrorTime.Time).To(BeTemporally(">=", firstErrorTime.Time)) }) + It("should evaluate Datadog deployment gates and mark the HealthCheck Healthy on pass", func() { + ctx := context.Background() + By("configuring deployment gate inputs on the DatadogMonitor") + datadogMonitor.Spec.Tags = []string{"env:prod", "service:payments"} + datadogMonitor.Annotations = map[string]string{ + datadogGateVersionAnnotationKey: "1.2.3", + datadogGateAPMPrimaryTagAnnotationKey: "team:platform", + } + Expect(k8sClient.Update(ctx, datadogMonitor)).To(Succeed()) + + gateClient := &fakeDeploymentGateClient{ + startFunc: func(_ context.Context, input deploymentGateInput) (string, error) { + return "eval-pass", nil + }, + getFunc: func(_ context.Context, evaluationID string) (deploymentGateEvaluation, error) { + return deploymentGateEvaluation{ + ID: evaluationID, + Status: "pass", + }, nil + }, + } + + controllerReconciler := &HealthCheckReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + GateClient: gateClient, + } + + By("starting a deployment gate evaluation") + result, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(deploymentGatePollInterval)) + Expect(gateClient.startedInputs).To(HaveLen(1)) + Expect(gateClient.startedInputs[0]).To(Equal(deploymentGateInput{ + Service: "payments", + Env: "prod", + Identifier: defaultDatadogGateIdentifier, + Version: "1.2.3", + APMPrimaryTag: "team:platform", + })) + + updatedHealthCheck := &kuberikrolloutv1alpha1.HealthCheck{} + err = k8sClient.Get(ctx, typeNamespacedName, updatedHealthCheck) + Expect(err).NotTo(HaveOccurred()) + Expect(updatedHealthCheck.Status.Status).To(Equal(kuberikrolloutv1alpha1.HealthStatusPending)) + Expect(updatedHealthCheck.Annotations[datadogGateEvaluationIDAnnotationKey]).To(Equal("eval-pass")) + Expect(updatedHealthCheck.Annotations[datadogGateInputHashAnnotationKey]).NotTo(BeEmpty()) + + By("polling the cached deployment gate evaluation") + _, err = controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(gateClient.requestedIDs).To(Equal([]string{"eval-pass"})) + + err = k8sClient.Get(ctx, typeNamespacedName, updatedHealthCheck) + Expect(err).NotTo(HaveOccurred()) + Expect(updatedHealthCheck.Status.Status).To(Equal(kuberikrolloutv1alpha1.HealthStatusHealthy)) + Expect(updatedHealthCheck.Status.LastErrorTime).To(BeNil()) + }) + + It("should evaluate Datadog deployment gates and mark the HealthCheck Unhealthy on fail", func() { + ctx := context.Background() + By("configuring deployment gate inputs on the DatadogMonitor") + datadogMonitor.Spec.Tags = []string{"env:prod", "service:payments"} + Expect(k8sClient.Update(ctx, datadogMonitor)).To(Succeed()) + + gateClient := &fakeDeploymentGateClient{ + startFunc: func(_ context.Context, input deploymentGateInput) (string, error) { + return "eval-fail", nil + }, + getFunc: func(_ context.Context, evaluationID string) (deploymentGateEvaluation, error) { + return deploymentGateEvaluation{ + ID: evaluationID, + Status: "fail", + }, nil + }, + } + + controllerReconciler := &HealthCheckReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + GateClient: gateClient, + } + + By("starting a deployment gate evaluation") + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + + By("polling the cached deployment gate evaluation") + _, err = controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(gateClient.requestedIDs).To(Equal([]string{"eval-fail"})) + + updatedHealthCheck := &kuberikrolloutv1alpha1.HealthCheck{} + err = k8sClient.Get(ctx, typeNamespacedName, updatedHealthCheck) + Expect(err).NotTo(HaveOccurred()) + Expect(updatedHealthCheck.Status.Status).To(Equal(kuberikrolloutv1alpha1.HealthStatusUnhealthy)) + Expect(updatedHealthCheck.Status.LastErrorTime).NotTo(BeNil()) + }) + It("should skip HealthChecks not managed by datadog-controller", func() { ctx := context.Background() By("creating a HealthCheck not managed by datadog-controller")