From 418ac8c67caffda718aba841019effabbc07b71d Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Tue, 16 Jun 2026 22:01:05 +0000
Subject: [PATCH 01/17] add deployment and observability scaffold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ansible/: systemd unit, env config, rolling deploy playbook, Vector→Axiom
  log forwarding, per-region inventory files (3 USW, 2 EUW, 2 EUC, staging)
grafana/: dashboard with clone rate, cache hit rate, latency, upstream
  fetches, NVMe usage, per-repo traffic, errors. Alert rules for node
  health, NVMe usage, upstream errors, clone latency, eviction rate.

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/README.md                             |  60 ++++
 ansible/ansible.cfg                           |   3 +
 ansible/production-euc.ini                    |   9 +
 ansible/production-euw.ini                    |   9 +
 ansible/production-usw.ini                    |  10 +
 ansible/roll.yaml                             |  95 ++++++
 ansible/secrets.yml                           |   3 +
 ansible/setup.yaml                            |  61 ++++
 ansible/staging.ini                           |   8 +
 ansible/tasks/setup_axiom.yaml                |  56 ++++
 ansible/templates/smart-git-proxy-env.j2      |  10 +
 ansible/templates/smart-git-proxy.service.j2  |  23 ++
 ansible/templates/vector.yaml.j2              |  37 +++
 .../alert-rules/smart-git-proxy-alerts.yaml   |  60 ++++
 grafana/dashboards/smart-git-proxy.json       | 292 ++++++++++++++++++
 15 files changed, 736 insertions(+)
 create mode 100644 ansible/README.md
 create mode 100644 ansible/ansible.cfg
 create mode 100644 ansible/production-euc.ini
 create mode 100644 ansible/production-euw.ini
 create mode 100644 ansible/production-usw.ini
 create mode 100644 ansible/roll.yaml
 create mode 100644 ansible/secrets.yml
 create mode 100644 ansible/setup.yaml
 create mode 100644 ansible/staging.ini
 create mode 100644 ansible/tasks/setup_axiom.yaml
 create mode 100644 ansible/templates/smart-git-proxy-env.j2
 create mode 100644 ansible/templates/smart-git-proxy.service.j2
 create mode 100644 ansible/templates/vector.yaml.j2
 create mode 100644 grafana/alert-rules/smart-git-proxy-alerts.yaml
 create mode 100644 grafana/dashboards/smart-git-proxy.json

diff --git a/ansible/README.md b/ansible/README.md
new file mode 100644
index 0000000..d9c75a5
--- /dev/null
+++ b/ansible/README.md
@@ -0,0 +1,60 @@
+# Ansible Deployment
+
+Playbooks and templates for deploying smart-git-proxy to dedicated proxy nodes.
+
+## Directory Structure
+
+```
+ansible/
+  setup.yaml               # One-time setup: systemd unit, env config, mirror dir, Vector logging
+  roll.yaml                # Rolling deploy: build or download binary, restart
+  tasks/
+    setup_axiom.yaml        # Vector → Axiom log forwarding
+  templates/
+    smart-git-proxy.service.j2   # systemd unit
+    smart-git-proxy-env.j2       # Environment config
+    vector.yaml.j2               # Vector config for Axiom
+  production-usw.ini        # US-West inventory (3 nodes)
+  production-euw.ini        # EU-West inventory (2 nodes)
+  production-euc.ini        # EU-Central inventory (2 nodes)
+  staging.ini               # Staging inventory
+  secrets.yml               # ansible-vault encrypted secrets
+```
+
+## Usage
+
+### Initial Setup (once per node)
+
+```bash
+ansible-playbook -i production-usw.ini setup.yaml --ask-vault-pass
+```
+
+### Deploy from Branch
+
+```bash
+ansible-playbook -i production-usw.ini roll.yaml -e branch=main --ask-vault-pass
+```
+
+### Deploy from Release
+
+```bash
+ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --ask-vault-pass
+```
+
+## Configuration
+
+Key environment variables (set in `templates/smart-git-proxy-env.j2`):
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LISTEN_ADDR` | `:8080` | HTTP listen address |
+| `MIRROR_DIR` | `/mnt/nvme/mirrors` | Path for bare git mirrors |
+| `MIRROR_MAX_SIZE` | `80%` | LRU eviction threshold |
+| `SYNC_STALE_AFTER` | `2s` | Upstream sync staleness window |
+| `AUTH_MODE` | `pass-through` | Forward client's GitHub token upstream |
+
+## Secrets
+
+`secrets.yml` must contain (ansible-vault encrypted):
+- `github_token` — GitHub token for cloning the repo during branch builds
+- `axiom_token` — Axiom API token for log forwarding
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
new file mode 100644
index 0000000..6d2dcd6
--- /dev/null
+++ b/ansible/ansible.cfg
@@ -0,0 +1,3 @@
+[defaults]
+host_key_checking = False
+timeout = 30
diff --git a/ansible/production-euc.ini b/ansible/production-euc.ini
new file mode 100644
index 0000000..5d7ee50
--- /dev/null
+++ b/ansible/production-euc.ini
@@ -0,0 +1,9 @@
+# EU-Central proxy nodes (2 nodes)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=eu-central
+env=production
+
+[proxy]
+# proxy-euc-1 ansible_host=<IP>
+# proxy-euc-2 ansible_host=<IP>
diff --git a/ansible/production-euw.ini b/ansible/production-euw.ini
new file mode 100644
index 0000000..dde0031
--- /dev/null
+++ b/ansible/production-euw.ini
@@ -0,0 +1,9 @@
+# EU-West proxy nodes (2 nodes)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=eu-west
+env=production
+
+[proxy]
+# proxy-euw-1 ansible_host=<IP>
+# proxy-euw-2 ansible_host=<IP>
diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini
new file mode 100644
index 0000000..c6bf394
--- /dev/null
+++ b/ansible/production-usw.ini
@@ -0,0 +1,10 @@
+# US-West proxy nodes (3 nodes)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=us-west
+env=production
+
+[proxy]
+# proxy-usw-1 ansible_host=<IP>
+# proxy-usw-2 ansible_host=<IP>
+# proxy-usw-3 ansible_host=<IP>
diff --git a/ansible/roll.yaml b/ansible/roll.yaml
new file mode 100644
index 0000000..2f328be
--- /dev/null
+++ b/ansible/roll.yaml
@@ -0,0 +1,95 @@
+---
+# Rolling deploy: build from branch (or download release), install binary, restart.
+# Usage:
+#   ansible-playbook -i production-usw.ini roll.yaml    (no vars: restart + health check only)
+#   ansible-playbook -i production-usw.ini roll.yaml -e branch=feat/my-change
+
+- name: Roll Smart Git Proxy
+  hosts: all
+  become: yes
+  serial: 1
+  vars_files:
+    - secrets.yml
+  vars:
+    branch: ""
+    release_tag: ""
+  tasks:
+    - name: Determine environment from inventory file name
+      set_fact:
+        ENV: "{{ 'production' if 'production' in (inventory_file | basename) else 'staging' if 'staging' in (inventory_file | basename) else '' }}"
+
+    - name: Ensure ENV is determined
+      fail:
+        msg: "Failed to determine environment from inventory file name."
+      when: ENV == ''
+
+    # --- Branch build path ---
+    - name: Install Go for branch build
+      shell: |
+        set -e  # stop at the first failed step; never delete the existing Go after a bad download
+        if /usr/local/go/bin/go version 2>/dev/null | grep -q 'go1.25'; then
+          echo "Go already installed"; exit 0
+        fi
+        wget -q https://go.dev/dl/go1.25.0.linux-amd64.tar.gz -O /tmp/go.tar.gz
+        rm -rf /usr/local/go
+        tar -C /usr/local -xzf /tmp/go.tar.gz
+        rm /tmp/go.tar.gz
+      when: branch != ""
+
+    - name: Clone repo at branch
+      git:
+        repo: "https://{{ github_token }}@github.com/useblacksmith/smart-git-proxy.git"
+        dest: /tmp/smart-git-proxy-build
+        version: "{{ branch }}"
+        force: yes
+      when: branch != ""
+
+    - name: Build from branch
+      shell: |
+        set -euo pipefail
+        export PATH=/usr/local/go/bin:$PATH
+        cd /tmp/smart-git-proxy-build
+        make build
+      args:
+        executable: /bin/bash
+      when: branch != ""
+
+    - name: Install branch binary
+      copy:
+        src: /tmp/smart-git-proxy-build/bin/smart-git-proxy
+        dest: /usr/local/bin/smart-git-proxy
+        mode: "0755"
+        remote_src: yes
+      when: branch != ""
+
+    - name: Clean up build directory
+      file:
+        path: /tmp/smart-git-proxy-build
+        state: absent
+      when: branch != ""
+
+    # --- Release download path ---
+    - name: Download release binary
+      get_url:
+        url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_linux_amd64"
+        dest: /usr/local/bin/smart-git-proxy
+        mode: "0755"
+        force: yes
+      when: branch == "" and release_tag != ""
+
+    # --- Restart ---
+    - name: Restart smart-git-proxy
+      systemd:
+        name: smart-git-proxy
+        state: restarted
+        enabled: yes
+
+    - name: Wait for health check
+      uri:
+        url: http://localhost:8080/healthz
+        status_code: 200
+        timeout: 5
+      register: health
+      retries: 10
+      delay: 2
+      until: health.status == 200
diff --git a/ansible/secrets.yml b/ansible/secrets.yml
new file mode 100644
index 0000000..e1ed311
--- /dev/null
+++ b/ansible/secrets.yml
@@ -0,0 +1,3 @@
+# Encrypted with ansible-vault.
+# Contains: github_token, axiom_token
+# To edit: ansible-vault edit secrets.yml
diff --git a/ansible/setup.yaml b/ansible/setup.yaml
new file mode 100644
index 0000000..890788f
--- /dev/null
+++ b/ansible/setup.yaml
@@ -0,0 +1,61 @@
+---
+# One-time setup: deploy systemd unit, env config, mirror directory, Vector logging.
+# Does not start the service -- the roll playbook handles that.
+
+- name: Setup Smart Git Proxy
+  hosts: all
+  become: yes
+  vars_files:
+    - secrets.yml
+  tasks:
+    - name: Determine environment from inventory file name
+      set_fact:
+        ENV: "{{ 'production' if 'production' in (inventory_file | basename) else 'staging' if 'staging' in (inventory_file | basename) else '' }}"
+
+    - name: Ensure ENV is determined
+      fail:
+        msg: "Failed to determine environment from inventory file name."
+      when: ENV == ''
+
+    # --- NVMe storage ---
+    - name: Check if NVMe mirror directory exists
+      stat:
+        path: /mnt/nvme
+      register: nvme_mount
+
+    - name: Create mirror directory
+      file:
+        path: "{{ mirror_dir | default('/mnt/nvme/mirrors') }}"
+        state: directory
+        mode: "0755"
+      when: nvme_mount.stat.exists
+
+    # --- Config ---
+    - name: Ensure config directory exists
+      file:
+        path: /etc/smart-git-proxy
+        state: directory
+        mode: "0755"
+
+    - name: Deploy environment config
+      template:
+        src: templates/smart-git-proxy-env.j2
+        dest: /etc/smart-git-proxy/env
+        mode: "0600"
+
+    # --- systemd ---
+    - name: Deploy systemd unit
+      template:
+        src: templates/smart-git-proxy.service.j2
+        dest: /etc/systemd/system/smart-git-proxy.service
+        mode: "0644"
+      register: systemd_unit
+
+    - name: Reload systemd daemon
+      systemd:
+        daemon_reload: yes
+      when: systemd_unit.changed
+
+    # --- Axiom log forwarding ---
+    - name: Setup Axiom log forwarding
+      include_tasks: tasks/setup_axiom.yaml
diff --git a/ansible/staging.ini b/ansible/staging.ini
new file mode 100644
index 0000000..b5eb016
--- /dev/null
+++ b/ansible/staging.ini
@@ -0,0 +1,8 @@
+# Staging proxy nodes (1 node per region or shared)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=us-west
+env=staging
+
+[proxy]
+# proxy-staging-1 ansible_host=<IP>
diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml
new file mode 100644
index 0000000..4d98135
--- /dev/null
+++ b/ansible/tasks/setup_axiom.yaml
@@ -0,0 +1,60 @@
+---
+- name: Add Vector repository
+  ansible.builtin.shell: |
+    bash -c "$(curl -L https://setup.vector.dev)"
+  args:
+    creates: /usr/share/keyrings/vector-archive-keyring.gpg
+
+- name: Install Vector
+  ansible.builtin.apt:
+    name: vector
+    state: present
+
+- name: Get hostname
+  shell: hostname
+  register: actual_hostname
+  changed_when: false
+
+- name: Create systemd override directory for Vector
+  file:
+    path: /etc/systemd/system/vector.service.d
+    state: directory
+    mode: "0755"
+
+- name: Configure Vector systemd override
+  copy:
+    content: |
+      [Service]
+      Environment="HOSTNAME={{ actual_hostname.stdout }}"
+    dest: /etc/systemd/system/vector.service.d/override.conf
+    mode: "0644"
+  register: vector_systemd_override
+
+- name: Reload systemd if Vector override changed
+  systemd:
+    daemon_reload: yes
+  when: vector_systemd_override.changed
+
+# The rendered config embeds axiom_token, so it must not be world-readable.
+# NOTE(review): assumes the vector package provides a `vector` group that the service runs as -- confirm.
+- name: Deploy Vector config
+  template:
+    src: templates/vector.yaml.j2
+    dest: /etc/vector/vector.yaml
+    owner: root
+    group: vector
+    mode: "0640"
+  register: vector_config
+
+- name: Enable and restart Vector
+  systemd:
+    name: vector
+    state: restarted
+    enabled: yes
+  when: vector_config.changed or vector_systemd_override.changed
+
+- name: Ensure Vector is running
+  systemd:
+    name: vector
+    state: started
+    enabled: yes
diff --git a/ansible/templates/smart-git-proxy-env.j2 b/ansible/templates/smart-git-proxy-env.j2
new file mode 100644
index 0000000..457b4ee
--- /dev/null
+++ b/ansible/templates/smart-git-proxy-env.j2
@@ -0,0 +1,10 @@
+# Smart Git Proxy configuration
+# See https://github.com/useblacksmith/smart-git-proxy#configuration
+
+LISTEN_ADDR=:8080
+MIRROR_DIR={{ mirror_dir | default('/mnt/nvme/mirrors') }}
+MIRROR_MAX_SIZE={{ mirror_max_size | default('80%') }}
+SYNC_STALE_AFTER={{ sync_stale_after | default('2s') }}
+ALLOWED_UPSTREAMS=github.com
+AUTH_MODE=pass-through
+LOG_LEVEL={{ log_level | default('info') }}
diff --git a/ansible/templates/smart-git-proxy.service.j2 b/ansible/templates/smart-git-proxy.service.j2
new file mode 100644
index 0000000..dae6118
--- /dev/null
+++ b/ansible/templates/smart-git-proxy.service.j2
@@ -0,0 +1,23 @@
+[Unit]
+Description=Smart Git Proxy
+After=network-online.target
+Wants=network-online.target
+StartLimitIntervalSec=300
+StartLimitBurst=10
+
+[Service]
+Type=exec
+Environment="HOME=/root"
+EnvironmentFile=/etc/smart-git-proxy/env
+ExecStart=/usr/local/bin/smart-git-proxy
+Restart=on-failure
+RestartSec=10
+TimeoutStopSec=30
+KillMode=control-group
+KillSignal=SIGTERM
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=smart-git-proxy
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2
new file mode 100644
index 0000000..5b06e65
--- /dev/null
+++ b/ansible/templates/vector.yaml.j2
@@ -0,0 +1,37 @@
+# Vector config for shipping smart-git-proxy logs to Axiom.
+sources:
+  smart_git_proxy_logs:
+    type: journald
+    units:
+      - smart-git-proxy
+
+transforms:
+  add_metadata:
+    type: remap
+    inputs:
+      - smart_git_proxy_logs
+    source: |
+      if exists(.message) {
+        parsed, err = parse_json(.message)
+        if err == null {
+          . = merge(., parsed)
+        }
+      }
+
+      . = merge(., {
+        "host": "${HOSTNAME}",
+        "environment": "{{ env }}",
+        "region": "{{ region }}",
+        "service_name": "smart-git-proxy"
+      })
+
+sinks:
+  axiom:
+    type: axiom
+    inputs:
+      - add_metadata
+    token: "{{ axiom_token }}"
+    dataset: smart-git-proxy
+    batch:
+      max_bytes: 1049000
+      timeout_secs: 1
diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml
new file mode 100644
index 0000000..d4224fb
--- /dev/null
+++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml
@@ -0,0 +1,60 @@
+# Grafana alert rules for Smart Git Proxy.
+# Import into Grafana via Alerting > Alert rules > Import.
+
+groups:
+  - name: smart-git-proxy
+    interval: 1m
+    rules:
+      - alert: ProxyNodeDown
+        expr: up{job="smart-git-proxy"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Smart git proxy node {{ $labels.instance }} is down"
+          description: "Health endpoint unreachable for 2 minutes."
+
+      - alert: NVMeUsageHigh
+        expr: smart_git_proxy_disk_usage_ratio > 0.80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "NVMe usage >80% on {{ $labels.instance }}"
+          description: "Mirror storage at {{ $value | humanizePercentage }}. LRU eviction should handle this, but may indicate undersized disk or eviction failure."
+
+      - alert: NVMeUsageCritical
+        expr: smart_git_proxy_disk_usage_ratio > 0.95
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "NVMe usage >95% on {{ $labels.instance }}"
+          description: "Mirror storage nearly full. LRU eviction may be failing."
+
+      - alert: UpstreamFetchErrorsHigh
+        expr: sum(rate(smart_git_proxy_upstream_fetch_errors_total[5m])) by (instance) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Elevated upstream fetch errors on {{ $labels.instance }}"
+          description: "Failing to fetch from GitHub at {{ $value | humanize }}/sec. May indicate GitHub rate limiting or network issues."
+
+      - alert: CloneLatencyHigh
+        expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket[5m])) by (le, instance)) > 30
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "p95 clone latency >30s on {{ $labels.instance }}"
+          description: "Slow git clone serving — may indicate NVMe I/O pressure or excessive concurrent requests."
+
+      - alert: HighEvictionRate
+        expr: sum(rate(smart_git_proxy_evictions_total[5m])) by (instance) > 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High LRU eviction rate on {{ $labels.instance }}"
+          description: "Evicting >1 mirror/sec sustained. May indicate insufficient NVMe capacity for the working set."
diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json
new file mode 100644
index 0000000..f3c1fb9
--- /dev/null
+++ b/grafana/dashboards/smart-git-proxy.json
@@ -0,0 +1,292 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "id": 100,
+      "title": "Overview",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Clone Requests / sec",
+      "description": "Rate of git clone/fetch requests served by the proxy.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Cache Hit Rate",
+      "description": "Percentage of requests served from warm local mirror (no upstream GitHub fetch).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_cache_hits_total{instance=~\"$instance\"}[$__rate_interval])) / sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) * 100",
+          "legendFormat": "hit rate %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Clone Latency (p50 / p95 / p99)",
+      "description": "Time to serve a git clone/fetch request from local NVMe mirror.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p50"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p95"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p99"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Upstream GitHub Fetches / sec",
+      "description": "Rate of upstream fetches to GitHub (should be low if mirrors are warm).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_upstream_fetches_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 },
+      "id": 101,
+      "title": "Storage",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Mirror Count",
+      "description": "Number of bare git mirrors currently on disk.",
+      "type": "stat",
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 18 },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "sum(smart_git_proxy_mirrors_total{instance=~\"$instance\"}) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "short" }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "NVMe Usage %",
+      "description": "Disk usage percentage of the mirror storage directory.",
+      "type": "gauge",
+      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 18 },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "smart_git_proxy_disk_usage_ratio{instance=~\"$instance\"} * 100",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 60 },
+              { "color": "red", "value": 80 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "NVMe Usage (bytes)",
+      "description": "Absolute disk usage over time for the mirror directory.",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 12, "x": 12, "y": 18 },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "smart_git_proxy_disk_usage_bytes{instance=~\"$instance\"}",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
+      "id": 102,
+      "title": "Per-Repo Traffic",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Top Repos by Request Rate",
+      "description": "Clone/fetch request rate broken down by repository.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "topk(10, sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))",
+          "legendFormat": "{{ repo }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 },
+      "id": 103,
+      "title": "Errors",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Upstream Fetch Errors / sec",
+      "description": "Failed upstream GitHub fetches.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_upstream_fetch_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "LRU Evictions / sec",
+      "description": "Rate of mirrors evicted to free disk space.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 },
+      "id": 10,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_evictions_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    }
+  ],
+  "templating": {
+    "list": [
+      {
+        "name": "datasource",
+        "type": "datasource",
+        "query": "prometheus",
+        "current": { "text": "Prometheus", "value": "PBFA97CFB590B2093" }
+      },
+      {
+        "name": "instance",
+        "type": "query",
+        "datasource": { "type": "prometheus", "uid": "${datasource}" },
+        "query": "label_values(smart_git_proxy_clone_requests_total, instance)",
+        "includeAll": true,
+        "allValue": ".*",
+        "multi": true,
+        "current": { "text": "All", "value": "$__all" }
+      }
+    ]
+  },
+  "time": { "from": "now-6h", "to": "now" },
+  "timepicker": {},
+  "timezone": "utc",
+  "title": "Smart Git Proxy",
+  "uid": "smart-git-proxy",
+  "version": 1,
+  "schemaVersion": 36
+}

From 7e28b4043ff6db59e95f03a65d0e53ca7b59d301 Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Tue, 16 Jun 2026 22:04:41 +0000
Subject: [PATCH 02/17] fix grafana metrics, nvme setup, and release download

- grafana: align metric names with internal/metrics/metrics.go
  (requests_total, request_seconds, sync_total, errors_total,
  responses_total) instead of invented names
- ansible/setup: fail explicitly when /mnt/nvme is missing instead
  of silently skipping mirror directory creation
- ansible/roll: download goreleaser tar.gz archive and extract,
  matching the actual release asset naming convention

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/roll.yaml                             |  19 ++-
 ansible/setup.yaml                            |   8 +-
 .../alert-rules/smart-git-proxy-alerts.yaml   |  49 +++---
 grafana/dashboards/smart-git-proxy.json       | 150 +++++-------------
 4 files changed, 78 insertions(+), 148 deletions(-)

diff --git a/ansible/roll.yaml b/ansible/roll.yaml
index 2f328be..4b38278 100644
--- a/ansible/roll.yaml
+++ b/ansible/roll.yaml
@@ -69,14 +69,25 @@
       when: branch != ""
 
     # --- Release download path ---
-    - name: Download release binary
+    # Goreleaser publishes archives as: smart-git-proxy_<version>_linux_amd64.tar.gz
+    - name: Download release archive
       get_url:
-        url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_linux_amd64"
-        dest: /usr/local/bin/smart-git-proxy
-        mode: "0755"
+        url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_{{ release_tag | regex_replace('^v', '') }}_linux_amd64.tar.gz"
+        dest: /tmp/smart-git-proxy-release.tar.gz
         force: yes
       when: branch == "" and release_tag != ""
 
+    - name: Extract release binary
+      shell: |
+        set -euo pipefail
+        mkdir -p /tmp/smart-git-proxy-release
+        tar -xzf /tmp/smart-git-proxy-release.tar.gz -C /tmp/smart-git-proxy-release
+        install -m 0755 /tmp/smart-git-proxy-release/smart-git-proxy /usr/local/bin/smart-git-proxy
+        rm -rf /tmp/smart-git-proxy-release /tmp/smart-git-proxy-release.tar.gz
+      args:
+        executable: /bin/bash
+      when: branch == "" and release_tag != ""
+
     # --- Restart ---
     - name: Restart smart-git-proxy
       systemd:
diff --git a/ansible/setup.yaml b/ansible/setup.yaml
index 890788f..e86e001 100644
--- a/ansible/setup.yaml
+++ b/ansible/setup.yaml
@@ -18,17 +18,21 @@
       when: ENV == ''
 
     # --- NVMe storage ---
-    - name: Check if NVMe mirror directory exists
+    - name: Verify NVMe mount exists
       stat:
         path: /mnt/nvme
       register: nvme_mount
 
+    - name: Fail if NVMe mount is missing
+      fail:
+        msg: "/mnt/nvme does not exist. Proxy nodes require NVMe storage for git mirrors."
+      when: not nvme_mount.stat.exists
+
     - name: Create mirror directory
       file:
         path: "{{ mirror_dir | default('/mnt/nvme/mirrors') }}"
         state: directory
         mode: "0755"
-      when: nvme_mount.stat.exists
 
     # --- Config ---
     - name: Ensure config directory exists
diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml
index d4224fb..441e82f 100644
--- a/grafana/alert-rules/smart-git-proxy-alerts.yaml
+++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml
@@ -1,5 +1,12 @@
 # Grafana alert rules for Smart Git Proxy.
 # Import into Grafana via Alerting > Alert rules > Import.
+#
+# Metric names match internal/metrics/metrics.go:
+#   smart_git_proxy_requests_total    (repo, kind, source)
+#   smart_git_proxy_responses_total   (repo, kind, status)
+#   smart_git_proxy_errors_total      (repo, kind)
+#   smart_git_proxy_request_seconds   (repo, kind) [histogram]
+#   smart_git_proxy_sync_total        (repo, result)
 
 groups:
   - name: smart-git-proxy
@@ -14,47 +21,29 @@ groups:
           summary: "Smart git proxy node {{ $labels.instance }} is down"
           description: "Health endpoint unreachable for 2 minutes."
 
-      - alert: NVMeUsageHigh
-        expr: smart_git_proxy_disk_usage_ratio > 0.80
+      - alert: ErrorRateHigh
+        expr: sum(rate(smart_git_proxy_errors_total[5m])) by (instance) > 0.5
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: "NVMe usage >80% on {{ $labels.instance }}"
-          description: "Mirror storage at {{ $value | humanizePercentage }}. LRU eviction should handle this, but may indicate undersized disk or eviction failure."
+          summary: "Elevated error rate on {{ $labels.instance }}"
+          description: "Errors at {{ $value | humanize }}/sec. May indicate upstream GitHub issues or local git failures."
 
-      - alert: NVMeUsageCritical
-        expr: smart_git_proxy_disk_usage_ratio > 0.95
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: "NVMe usage >95% on {{ $labels.instance }}"
-          description: "Mirror storage nearly full. LRU eviction may be failing."
-
-      - alert: UpstreamFetchErrorsHigh
-        expr: sum(rate(smart_git_proxy_upstream_fetch_errors_total[5m])) by (instance) > 0.5
+      - alert: RequestLatencyHigh
+        expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket[5m])) by (le, instance)) > 30
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: "Elevated upstream fetch errors on {{ $labels.instance }}"
-          description: "Failing to fetch from GitHub at {{ $value | humanize }}/sec. May indicate GitHub rate limiting or network issues."
+          summary: "p95 request latency >30s on {{ $labels.instance }}"
+          description: "Slow git request serving — may indicate NVMe I/O pressure or excessive concurrent requests."
 
-      - alert: CloneLatencyHigh
-        expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket[5m])) by (le, instance)) > 30
+      - alert: SyncFailureRateHigh
+        expr: sum(rate(smart_git_proxy_sync_total{result="error"}[5m])) by (instance) > 0.5
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: "p95 clone latency >30s on {{ $labels.instance }}"
-          description: "Slow git clone serving — may indicate NVMe I/O pressure or excessive concurrent requests."
-
-      - alert: HighEvictionRate
-        expr: sum(rate(smart_git_proxy_evictions_total[5m])) by (instance) > 1
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High LRU eviction rate on {{ $labels.instance }}"
-          description: "Evicting >1 mirror/sec sustained. May indicate insufficient NVMe capacity for the working set."
+          summary: "Elevated sync failures on {{ $labels.instance }}"
+          description: "Upstream GitHub sync failures at {{ $value | humanize }}/sec. May indicate rate limiting or network issues."
diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json
index f3c1fb9..728d146 100644
--- a/grafana/dashboards/smart-git-proxy.json
+++ b/grafana/dashboards/smart-git-proxy.json
@@ -27,14 +27,14 @@
     },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "Clone Requests / sec",
-      "description": "Rate of git clone/fetch requests served by the proxy.",
+      "title": "Requests / sec",
+      "description": "Rate of git requests served by the proxy (all kinds: info-refs, upload-pack).",
       "type": "timeseries",
       "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
       "id": 1,
       "targets": [
         {
-          "expr": "sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "expr": "sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
           "legendFormat": "{{ instance }}"
         }
       ],
@@ -47,44 +47,42 @@
     },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "Cache Hit Rate",
-      "description": "Percentage of requests served from warm local mirror (no upstream GitHub fetch).",
+      "title": "Requests by Kind",
+      "description": "Request rate broken down by kind (info-refs, upload-pack).",
       "type": "timeseries",
       "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
       "id": 2,
       "targets": [
         {
-          "expr": "sum(rate(smart_git_proxy_cache_hits_total{instance=~\"$instance\"}[$__rate_interval])) / sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) * 100",
-          "legendFormat": "hit rate %"
+          "expr": "sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (kind)",
+          "legendFormat": "{{ kind }}"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent",
-          "min": 0,
-          "max": 100,
+          "unit": "reqps",
           "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
         }
       }
     },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "Clone Latency (p50 / p95 / p99)",
-      "description": "Time to serve a git clone/fetch request from local NVMe mirror.",
+      "title": "Request Latency (p50 / p95 / p99)",
+      "description": "Time to serve a git request from local NVMe mirror.",
       "type": "timeseries",
       "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
       "id": 3,
       "targets": [
         {
-          "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
+          "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
           "legendFormat": "p50"
         },
         {
-          "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
+          "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
           "legendFormat": "p95"
         },
         {
-          "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
+          "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
           "legendFormat": "p99"
         }
       ],
@@ -97,15 +95,15 @@
     },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "Upstream GitHub Fetches / sec",
-      "description": "Rate of upstream fetches to GitHub (should be low if mirrors are warm).",
+      "title": "Mirror Syncs / sec",
+      "description": "Rate of mirror sync operations (upstream fetches from GitHub), by result.",
       "type": "timeseries",
       "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
       "id": 4,
       "targets": [
         {
-          "expr": "sum(rate(smart_git_proxy_upstream_fetches_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
-          "legendFormat": "{{ instance }}"
+          "expr": "sum(rate(smart_git_proxy_sync_total{instance=~\"$instance\"}[$__rate_interval])) by (result)",
+          "legendFormat": "{{ result }}"
         }
       ],
       "fieldConfig": {
@@ -119,77 +117,52 @@
       "collapsed": false,
       "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 },
       "id": 101,
-      "title": "Storage",
+      "title": "Responses & Errors",
       "type": "row"
     },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "Mirror Count",
-      "description": "Number of bare git mirrors currently on disk.",
-      "type": "stat",
-      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 18 },
+      "title": "Response Status",
+      "description": "Responses by HTTP status code.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
       "id": 5,
       "targets": [
         {
-          "expr": "sum(smart_git_proxy_mirrors_total{instance=~\"$instance\"}) by (instance)",
-          "legendFormat": "{{ instance }}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": { "unit": "short" }
-      }
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "NVMe Usage %",
-      "description": "Disk usage percentage of the mirror storage directory.",
-      "type": "gauge",
-      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 18 },
-      "id": 6,
-      "targets": [
-        {
-          "expr": "smart_git_proxy_disk_usage_ratio{instance=~\"$instance\"} * 100",
-          "legendFormat": "{{ instance }}"
+          "expr": "sum(rate(smart_git_proxy_responses_total{instance=~\"$instance\"}[$__rate_interval])) by (status)",
+          "legendFormat": "{{ status }}"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent",
-          "min": 0,
-          "max": 100,
-          "thresholds": {
-            "steps": [
-              { "color": "green", "value": null },
-              { "color": "yellow", "value": 60 },
-              { "color": "red", "value": 80 }
-            ]
-          }
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
         }
       }
     },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "NVMe Usage (bytes)",
-      "description": "Absolute disk usage over time for the mirror directory.",
+      "title": "Errors / sec",
+      "description": "Error rate per instance (all repos and kinds combined).",
       "type": "timeseries",
-      "gridPos": { "h": 6, "w": 12, "x": 12, "y": 18 },
-      "id": 7,
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
+      "id": 6,
       "targets": [
         {
-          "expr": "smart_git_proxy_disk_usage_bytes{instance=~\"$instance\"}",
+          "expr": "sum(rate(smart_git_proxy_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
           "legendFormat": "{{ instance }}"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "bytes",
+          "unit": "reqps",
           "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
         }
       }
     },
     {
       "collapsed": false,
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
       "id": 102,
       "title": "Per-Repo Traffic",
       "type": "row"
@@ -197,13 +170,13 @@
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "title": "Top Repos by Request Rate",
-      "description": "Clone/fetch request rate broken down by repository.",
+      "description": "Request rate broken down by repository.",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 },
-      "id": 8,
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 27 },
+      "id": 7,
       "targets": [
         {
-          "expr": "topk(10, sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))",
+          "expr": "topk(10, sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))",
           "legendFormat": "{{ repo }}"
         }
       ],
@@ -213,53 +186,6 @@
           "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
         }
       }
-    },
-    {
-      "collapsed": false,
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 },
-      "id": 103,
-      "title": "Errors",
-      "type": "row"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "Upstream Fetch Errors / sec",
-      "description": "Failed upstream GitHub fetches.",
-      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 },
-      "id": 9,
-      "targets": [
-        {
-          "expr": "sum(rate(smart_git_proxy_upstream_fetch_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
-          "legendFormat": "{{ instance }}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "reqps",
-          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
-        }
-      }
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "title": "LRU Evictions / sec",
-      "description": "Rate of mirrors evicted to free disk space.",
-      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 },
-      "id": 10,
-      "targets": [
-        {
-          "expr": "sum(rate(smart_git_proxy_evictions_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
-          "legendFormat": "{{ instance }}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "reqps",
-          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
-        }
-      }
     }
   ],
   "templating": {
@@ -274,7 +200,7 @@
         "name": "instance",
         "type": "query",
         "datasource": { "type": "prometheus", "uid": "${datasource}" },
-        "query": "label_values(smart_git_proxy_clone_requests_total, instance)",
+        "query": "label_values(smart_git_proxy_requests_total, instance)",
         "includeAll": true,
         "allValue": ".*",
         "multi": true,

From aa31b37817742795233f74c3ba6a142aaba4fd4b Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Tue, 16 Jun 2026 22:07:28 +0000
Subject: [PATCH 03/17] fix review feedback: GO=go override, deploy guard,
 histogram grouping, vector perms

- roll.yaml: export GO=go before make build (Makefile defaults to
  mise exec -- go which isn't on deploy targets)
- roll.yaml: fail early if neither branch nor release_tag provided,
  preventing needless service restart
- grafana: group histogram_quantile by (le, instance) to avoid
  merging latencies across nodes
- setup_axiom: tighten vector.yaml to 0600 (contains axiom_token)

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/roll.yaml                       |  7 +++++++
 ansible/tasks/setup_axiom.yaml          |  2 +-
 grafana/dashboards/smart-git-proxy.json | 12 ++++++------
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/ansible/roll.yaml b/ansible/roll.yaml
index 4b38278..8fe2186 100644
--- a/ansible/roll.yaml
+++ b/ansible/roll.yaml
@@ -48,6 +48,7 @@
       shell: |
         set -euo pipefail
         export PATH=/usr/local/go/bin:$PATH
+        export GO=go
         cd /tmp/smart-git-proxy-build
         make build
       args:
@@ -88,6 +89,12 @@
         executable: /bin/bash
       when: branch == "" and release_tag != ""
 
+    # --- Validate deploy target ---
+    - name: Fail if neither branch nor release_tag provided
+      fail:
+        msg: "Must specify either -e branch=<branch> or -e release_tag=<tag>"
+      when: branch == "" and release_tag == ""
+
     # --- Restart ---
     - name: Restart smart-git-proxy
       systemd:
diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml
index 4d98135..6a3e2f1 100644
--- a/ansible/tasks/setup_axiom.yaml
+++ b/ansible/tasks/setup_axiom.yaml
@@ -39,7 +39,7 @@
   template:
     src: templates/vector.yaml.j2
     dest: /etc/vector/vector.yaml
-    mode: "0644"
+    mode: "0600"
   register: vector_config
 
 - name: Enable and restart Vector
diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json
index 728d146..fdfae93 100644
--- a/grafana/dashboards/smart-git-proxy.json
+++ b/grafana/dashboards/smart-git-proxy.json
@@ -74,16 +74,16 @@
       "id": 3,
       "targets": [
         {
-          "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
-          "legendFormat": "p50"
+          "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))",
+          "legendFormat": "{{ instance }} p50"
         },
         {
-          "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
-          "legendFormat": "p95"
+          "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))",
+          "legendFormat": "{{ instance }} p95"
         },
         {
-          "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))",
-          "legendFormat": "p99"
+          "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))",
+          "legendFormat": "{{ instance }} p99"
         }
       ],
       "fieldConfig": {

From 666b88e02150d1e2ecddf096225fb3c1653a37bf Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Tue, 16 Jun 2026 22:34:06 +0000
Subject: [PATCH 04/17] ansible: add update_cache to vector apt install

Ensures apt cache is refreshed after adding the Vector repo so the
package is discoverable on first run.

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/tasks/setup_axiom.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml
index 6a3e2f1..cf3a2f9 100644
--- a/ansible/tasks/setup_axiom.yaml
+++ b/ansible/tasks/setup_axiom.yaml
@@ -9,6 +9,7 @@
   ansible.builtin.apt:
     name: vector
     state: present
+    update_cache: yes
 
 - name: Get hostname
   shell: hostname

From 2fdf83acc8ef4bdc62b6f2915a30dca85cbca0d1 Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 12:53:07 +0000
Subject: [PATCH 05/17] add CI workflows for build+deploy and grafana dashboard
 sync

- build-and-deploy.yaml: build Go binary, push to R2, deploy via
  Ansible+Tailscale (same pattern as storage-agent)
- deploy-grafana-dashboards.yml: auto-deploy dashboards and alert
  rules to self-hosted Grafana on push to main/production

Required secrets: R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY,
TS_OAUTH_CLIENT_ID, TS_OAUTH_SECRET, ANSIBLE_SECRET,
SELF_HOSTED_GRAFANA_URL, SELF_HOSTED_GRAFANA_USER,
SELF_HOSTED_GRAFANA_PASSWORD

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 .github/workflows/build-and-deploy.yaml       | 176 ++++++++++++++
 .../workflows/deploy-grafana-dashboards.yml   | 229 ++++++++++++++++++
 2 files changed, 405 insertions(+)
 create mode 100644 .github/workflows/build-and-deploy.yaml
 create mode 100644 .github/workflows/deploy-grafana-dashboards.yml

diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml
new file mode 100644
index 0000000..9431e86
--- /dev/null
+++ b/.github/workflows/build-and-deploy.yaml
@@ -0,0 +1,176 @@
+name: Build and Deploy
+
+on:
+  push:
+    branches:
+      - main
+      - production
+  pull_request:
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: "Branch to deploy (main/production)"
+        required: true
+        type: choice
+        options:
+          - main
+          - production
+        default: "main"
+      run_deployment:
+        description: "Run ansible deployment after build"
+        required: true
+        type: boolean
+        default: false
+
+concurrency:
+  group: "build-and-deploy-${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref == 'refs/heads/production' && 'production' || github.event.inputs.branch == 'production' && 'production' || 'staging' }}"  # PR builds get a per-PR group so they never queue behind, or displace, a pending staging/production deploy
+  cancel-in-progress: false
+
+jobs:
+  build-and-deploy:
+    runs-on: blacksmith-8vcpu-ubuntu-2204
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+
+      - name: Build binary
+        run: |
+          export GO=go
+          make build
+          cp bin/smart-git-proxy ./smart-git-proxy
+
+      - name: Run tests
+        if: github.event_name == 'pull_request'
+        run: go test ./...
+
+      - name: Install rclone
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch'
+        run: set -o pipefail; curl -fsSL https://rclone.org/install.sh | sudo bash  # -f + pipefail: a failed download fails the step instead of feeding an error page to a root shell
+
+      - name: Configure rclone
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch'
+        env:
+          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
+          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
+        run: |
+          mkdir -p ~/.config/rclone
+          cat > ~/.config/rclone/rclone.conf <<EOF
+          [r2]
+          type = s3
+          provider = Cloudflare
+          access_key_id = $R2_ACCESS_KEY_ID
+          secret_access_key = $R2_SECRET_ACCESS_KEY
+          endpoint = https://1ede90a8395416f286ba9f692dc6bacf.r2.cloudflarestorage.com
+          EOF
+
+      - name: Push binary to R2
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch'
+        run: |
+          SHA=$(git rev-parse HEAD)
+          echo "SHA=$SHA" >> $GITHUB_ENV
+          if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then
+            rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/production/$SHA
+          else
+            rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/main/$SHA
+          fi
+
+      - name: Set up Python
+        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      - name: Install Ansible
+        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        run: |
+          python -m pip install --upgrade pip
+          pip install ansible
+
+      - name: Connect to Tailscale
+        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        uses: tailscale/github-action@v3
+        with:
+          oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
+          oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
+          tags: tag:ci
+
+      - name: Create Ansible Vault password file
+        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        run: umask 077 && echo "${{ secrets.ANSIBLE_SECRET }}" > ~/vault-password.txt  # umask 077: the vault password file is created owner-only (0600)
+
+      - name: Check host connectivity
+        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        working-directory: ansible
+        run: |
+          sleep 10
+          if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then
+            INVENTORY_FILES="production-usw.ini production-euw.ini production-euc.ini"
+          else
+            INVENTORY_FILES="staging.ini"
+          fi
+          for INVENTORY_FILE in $INVENTORY_FILES; do
+            echo "=== Checking hosts in $INVENTORY_FILE ==="
+            HOSTS=$(ansible-inventory -i $INVENTORY_FILE --list | jq -r '._meta.hostvars | keys[]')
+            if [ -z "$HOSTS" ]; then
+              echo "warning: no hosts found in $INVENTORY_FILE, skipping"
+              continue
+            fi
+            for host in $HOSTS; do
+              echo "Testing connectivity to $host..."
+              start_time=$(date +%s)
+              while true; do
+                if tailscale ping -c 1 --timeout=5s $host >/dev/null 2>&1; then
+                  echo "$host is reachable"
+                  break
+                fi
+                current_time=$(date +%s)
+                elapsed=$((current_time - start_time))
+                if [ $elapsed -ge 30 ]; then
+                  echo "error: timeout after 30s waiting for $host"
+                  exit 1
+                fi
+                echo "Waiting for $host... (${elapsed}s elapsed)"
+                sleep 5
+              done
+            done
+          done
+        env:
+          ANSIBLE_HOST_KEY_CHECKING: "False"
+
+      - name: Run Ansible rolling deploy
+        id: ansible-deploy
+        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        working-directory: ansible
+        run: |
+          BRANCH_ARG="-e branch=${GITHUB_REF_NAME}"  # runner-provided env var instead of expression interpolation, so the ref name is never spliced into the script text
+          if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then
+            for REGION_INI in production-usw.ini production-euw.ini production-euc.ini; do
+              echo "=== Rolling region: $REGION_INI ==="
+              ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i "$REGION_INI" \
+                --vault-password-file ~/vault-password.txt \
+                roll.yaml $BRANCH_ARG -v
+            done
+          else
+            ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i staging.ini \
+              --vault-password-file ~/vault-password.txt \
+              roll.yaml $BRANCH_ARG -v
+          fi
+        env:
+          ANSIBLE_HOST_KEY_CHECKING: "False"
+
+      - name: Send Slack notification on failure
+        if: failure() && steps.ansible-deploy.outcome == 'failure'
+        uses: slackapi/slack-github-action@v1
+        with:
+          payload: |
+            {
+              "text": "Ansible deploy failed for smart-git-proxy! Branch: ${{ github.ref_name || github.event.inputs.branch }}, Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+            }
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}  # webhook URLs are credentials; read from the SLACK_WEBHOOK_URL repo secret, never commit the URL
diff --git a/.github/workflows/deploy-grafana-dashboards.yml b/.github/workflows/deploy-grafana-dashboards.yml
new file mode 100644
index 0000000..a207aec
--- /dev/null
+++ b/.github/workflows/deploy-grafana-dashboards.yml
@@ -0,0 +1,229 @@
+name: Deploy Dashboards & Alerts to Self-Hosted Grafana
+
+on:
+  push:
+    branches: [main, production]
+    paths:
+      - "grafana/dashboards/*.json"
+      - "grafana/alert-rules/*.yaml"
+      - "grafana/alert-rules/*.yml"
+      - ".github/workflows/deploy-grafana-dashboards.yml"
+  pull_request:
+    paths:
+      - "grafana/dashboards/*.json"
+      - "grafana/alert-rules/*.yaml"
+      - "grafana/alert-rules/*.yml"
+      - ".github/workflows/deploy-grafana-dashboards.yml"
+  workflow_dispatch:
+
+jobs:
+  deploy-dashboards:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set folder name
+        run: |
+          if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then
+            echo "FOLDER_NAME=Smart Git Proxy Production" >> $GITHUB_ENV
+          else
+            echo "FOLDER_NAME=Smart Git Proxy Staging" >> $GITHUB_ENV
+          fi
+
+      - name: Validate JSON syntax
+        run: |
+          for dashboard in grafana/dashboards/*.json; do
+            echo "Validating $dashboard..."
+            jq . "$dashboard" > /dev/null || exit 1
+          done
+          echo "All dashboard JSON files are valid"
+
+      - name: Deploy dashboards to Self-Hosted Grafana
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production'
+        env:
+          GRAFANA_URL: ${{ secrets.SELF_HOSTED_GRAFANA_URL }}
+          GRAFANA_USER: ${{ secrets.SELF_HOSTED_GRAFANA_USER }}
+          GRAFANA_PASSWORD: ${{ secrets.SELF_HOSTED_GRAFANA_PASSWORD }}
+        run: |
+          if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then
+            UID_SUFFIX="-prod-self"
+          else
+            UID_SUFFIX="-staging-self"
+          fi
+          # base64 wraps its output at 76 columns; strip newlines so long credentials still yield a one-line header
+          AUTH_HEADER="Authorization: Basic $(echo -n "$GRAFANA_USER:$GRAFANA_PASSWORD" | base64 | tr -d '\n')"
+
+          # Look up the target folder by title; create it with a timestamped UID when it does not exist yet
+          FOLDERS_RESPONSE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/folders")
+          FOLDER_UID=$(echo "$FOLDERS_RESPONSE" | jq -r --arg name "$FOLDER_NAME" '.[] | select(.title == $name) | .uid' | head -1)
+
+          if [ -z "$FOLDER_UID" ] || [ "$FOLDER_UID" == "null" ]; then
+            FOLDER_UID=$(echo "$FOLDER_NAME" | tr '[:upper:]' '[:lower:]' | tr ' ' '-')-$(date +%s)
+            CREATE_RESPONSE=$(curl -s -X POST \
+              -H "$AUTH_HEADER" \
+              -H "Content-Type: application/json" \
+              -d "{\"title\": \"$FOLDER_NAME\", \"uid\": \"$FOLDER_UID\"}" \
+              "$GRAFANA_URL/api/folders")
+            if ! echo "$CREATE_RESPONSE" | grep -q '"uid"'; then
+              echo "Error creating folder: $CREATE_RESPONSE"
+              exit 1
+            fi
+          fi
+
+          echo "FOLDER_UID=$FOLDER_UID" >> $GITHUB_ENV  # exported for the alert-rules step that follows
+
+          for dashboard_file in grafana/dashboards/*.json; do
+            if [ -f "$dashboard_file" ]; then
+              dashboard_name=$(basename "$dashboard_file" .json)
+              echo "Uploading $dashboard_name..."
+
+              DASHBOARD_JSON=$(cat "$dashboard_file")
+
+              # Suffix the UID per environment so staging and production copies of a dashboard never collide
+              if echo "$DASHBOARD_JSON" | jq -e '.uid' > /dev/null 2>&1; then
+                ORIGINAL_UID=$(echo "$DASHBOARD_JSON" | jq -r '.uid')
+                DASHBOARD_JSON=$(echo "$DASHBOARD_JSON" | jq --arg uid "${ORIGINAL_UID}${UID_SUFFIX}" '.uid = $uid')
+              else
+                DASHBOARD_JSON=$(echo "$DASHBOARD_JSON" | jq --arg uid "${dashboard_name}${UID_SUFFIX}" '. + {uid: $uid}')
+              fi
+
+              PAYLOAD_FILE=$(mktemp)
+              echo "$DASHBOARD_JSON" | jq \
+                --arg folderUid "$FOLDER_UID" \
+                '{dashboard: ., folderUid: $folderUid, overwrite: true}' > "$PAYLOAD_FILE"
+
+              RESPONSE=$(curl -s -X POST \
+                -H "$AUTH_HEADER" \
+                -H "Content-Type: application/json" \
+                --data-binary "@$PAYLOAD_FILE" \
+                "$GRAFANA_URL/api/dashboards/db")
+              rm -f "$PAYLOAD_FILE"
+
+              if echo "$RESPONSE" | grep -q '"status":"success"'; then
+                echo "  Uploaded $dashboard_name (version: $(echo "$RESPONSE" | jq -r .version))"
+              else
+                echo "  Failed: $RESPONSE"
+                echo "  Continuing..."
+              fi
+            fi
+          done
+
+      - name: Deploy alert rules to Self-Hosted Grafana
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production'
+        env:
+          GRAFANA_URL: ${{ secrets.SELF_HOSTED_GRAFANA_URL }}
+          GRAFANA_USER: ${{ secrets.SELF_HOSTED_GRAFANA_USER }}
+          GRAFANA_PASSWORD: ${{ secrets.SELF_HOSTED_GRAFANA_PASSWORD }}
+        run: |
+          AUTH_HEADER="Authorization: Basic $(echo -n "$GRAFANA_USER:$GRAFANA_PASSWORD" | base64 | tr -d '\n')"
+          # (newlines stripped above: base64 wraps at 76 columns, which would split the header for long credentials)
+          if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then
+            DASHBOARD_UID_SUFFIX="-prod-self"
+          else
+            DASHBOARD_UID_SUFFIX="-staging-self"
+          fi
+
+          # Resolve Prometheus datasource UID
+          DATASOURCES_RESPONSE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/datasources")
+          DATASOURCE_UID=$(echo "$DATASOURCES_RESPONSE" | jq -r '[.[] | select(.type=="prometheus")] | .[0].uid // empty')
+          if [ -z "$DATASOURCE_UID" ] || [ "$DATASOURCE_UID" = "null" ]; then
+            echo "Could not resolve Prometheus datasource UID"
+            exit 1
+          fi
+          echo "Using Prometheus datasource UID: $DATASOURCE_UID"
+
+          # Reuse the folder UID the dashboard step exported through GITHUB_ENV; abort rather than post an empty folderUID
+          ALERT_FOLDER_UID="${FOLDER_UID:?FOLDER_UID is not set; the dashboard deploy step must run first}"
+
+          shopt -s nullglob
+          alert_files=(grafana/alert-rules/*.yaml grafana/alert-rules/*.yml)
+          if [ ${#alert_files[@]} -eq 0 ]; then
+            echo "No alert rule files found, skipping"
+            exit 0
+          fi
+
+          # Install yq
+          if ! command -v yq &> /dev/null; then
+            wget -q https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O yq
+            chmod +x yq
+            YQ_CMD="./yq"
+          else
+            YQ_CMD="yq"
+          fi
+
+          for alert_file in "${alert_files[@]}"; do
+            alert_name=$(basename "$alert_file" .yaml)
+            alert_name=$(basename "$alert_name" .yml)
+            echo "Uploading alert rules from $alert_name..."
+
+            ALERT_JSON=$($YQ_CMD eval -o=json "$alert_file")
+
+            # Replace datasource placeholders and template variables
+            ALERT_JSON=$(echo "$ALERT_JSON" | jq --arg uid "$DATASOURCE_UID" '
+              .groups |= map(
+                del(.folder) |
+                .rules |= map(
+                  .data |= map(
+                    (if (has("datasourceUid") and .datasourceUid == "${datasource}") then
+                      .datasourceUid = $uid
+                    else . end) |
+                    if has("model") and (.model | has("expr")) then
+                      .model.expr = (.model.expr |
+                        gsub("\\$instance"; ".*") |
+                        gsub("instance=~\"\\*\""; "instance=~\".*\""))
+                    else . end
+                  )
+                )
+              )')
+
+            GROUPS_COUNT=$(echo "$ALERT_JSON" | jq '.groups | length')
+            for group_idx in $(seq 0 $((GROUPS_COUNT - 1))); do
+              RULES_COUNT=$(echo "$ALERT_JSON" | jq --argjson g "$group_idx" '.groups[$g].rules | length')
+              GROUP_NAME=$(echo "$ALERT_JSON" | jq -r --argjson g "$group_idx" '.groups[$g].name')
+              GROUP_INTERVAL=$(echo "$ALERT_JSON" | jq -r --argjson g "$group_idx" '.groups[$g].interval // "1m"')
+
+              for rule_idx in $(seq 0 $((RULES_COUNT - 1))); do
+                RULE=$(echo "$ALERT_JSON" | jq --argjson g "$group_idx" --argjson idx "$rule_idx" '.groups[$g].rules[$idx]')
+                ORIGINAL_RULE_UID=$(echo "$RULE" | jq -r '.uid // error("alert rule has no uid")') || exit 1  # else every uid-less rule would share "null_s"/"null_p"
+
+                if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then
+                  RULE_UID="${ORIGINAL_RULE_UID}_p"
+                else
+                  RULE_UID="${ORIGINAL_RULE_UID}_s"
+                fi
+
+                RULE_PAYLOAD=$(echo "$RULE" | jq \
+                  --arg folderUid "$ALERT_FOLDER_UID" \
+                  --arg groupName "$GROUP_NAME" \
+                  --arg interval "$GROUP_INTERVAL" \
+                  --arg ruleUid "$RULE_UID" '
+                  . + {uid: $ruleUid, folderUID: $folderUid, ruleGroup: $groupName, interval: $interval}')
+
+                RULE_PAYLOAD=$(echo "$RULE_PAYLOAD" | jq --arg suffix "$DASHBOARD_UID_SUFFIX" '
+                  if (.annotations.__dashboardUid__ // "") != "" then
+                    .annotations.__dashboardUid__ += $suffix
+                  else . end')
+
+                # Upsert rule: PUT when the UID already exists in Grafana, POST otherwise
+                EXISTING_RULE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/v1/provisioning/alert-rules/$RULE_UID")
+                if echo "$EXISTING_RULE" | grep -q '"uid"'; then
+                  RESPONSE=$(curl -s -X PUT \
+                    -H "$AUTH_HEADER" -H "Content-Type: application/json" -H "X-Disable-Provenance: true" \
+                    -d "$RULE_PAYLOAD" "$GRAFANA_URL/api/v1/provisioning/alert-rules/$RULE_UID")
+                else
+                  RESPONSE=$(curl -s -X POST \
+                    -H "$AUTH_HEADER" -H "Content-Type: application/json" -H "X-Disable-Provenance: true" \
+                    -d "$RULE_PAYLOAD" "$GRAFANA_URL/api/v1/provisioning/alert-rules")
+                fi
+
+                if echo "$RESPONSE" | grep -q '"uid"'; then
+                  echo "  Processed rule: $RULE_UID"
+                else
+                  echo "  Failed rule $RULE_UID: $RESPONSE"
+                  exit 1
+                fi
+              done
+            done
+          done
+
+          echo "Alert rules deployment complete!"

From f3198b971fe30c6fe601197156d3f3e15f57031e Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 12:56:39 +0000
Subject: [PATCH 06/17] fix bugbot issues: deploy gating, branch resolution,
 alert UIDs

- Add deploy-target step that centralizes env/branch resolution;
  production pushes now correctly trigger Ansible deploy
- Manual workflow_dispatch uses inputs.branch for Ansible instead of
  the checked-out ref name
- Add unique uid to each alert rule (sgp-node-down, sgp-error-rate,
  sgp-latency-high, sgp-sync-failures) so Grafana provisioning
  upserts correctly

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 .github/workflows/build-and-deploy.yaml       | 30 +++++++++++++------
 .../alert-rules/smart-git-proxy-alerts.yaml   | 12 +++++---
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml
index 9431e86..aaf72f1 100644
--- a/.github/workflows/build-and-deploy.yaml
+++ b/.github/workflows/build-and-deploy.yaml
@@ -80,20 +80,32 @@ jobs:
             rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/main/$SHA
           fi
 
+      - name: Determine deploy target
+        id: deploy-target
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        run: |
+          if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then
+            echo "env=production" >> $GITHUB_OUTPUT
+            echo "branch=production" >> $GITHUB_OUTPUT
+          else
+            echo "env=staging" >> $GITHUB_OUTPUT
+            echo "branch=${{ github.event.inputs.branch || github.ref_name }}" >> $GITHUB_OUTPUT
+          fi
+
       - name: Set up Python
-        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        if: steps.deploy-target.outcome == 'success'
         uses: actions/setup-python@v5
         with:
           python-version: "3.x"
 
       - name: Install Ansible
-        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        if: steps.deploy-target.outcome == 'success'
         run: |
           python -m pip install --upgrade pip
           pip install ansible
 
       - name: Connect to Tailscale
-        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        if: steps.deploy-target.outcome == 'success'
         uses: tailscale/github-action@v3
         with:
           oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
@@ -101,15 +113,15 @@ jobs:
           tags: tag:ci
 
       - name: Create Ansible Vault password file
-        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        if: steps.deploy-target.outcome == 'success'
         run: echo "${{ secrets.ANSIBLE_SECRET }}" > ~/vault-password.txt
 
       - name: Check host connectivity
-        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        if: steps.deploy-target.outcome == 'success'
         working-directory: ansible
         run: |
           sleep 10
-          if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then
+          if [[ "${{ steps.deploy-target.outputs.env }}" == "production" ]]; then
             INVENTORY_FILES="production-usw.ini production-euw.ini production-euc.ini"
           else
             INVENTORY_FILES="staging.ini"
@@ -145,11 +157,11 @@ jobs:
 
       - name: Run Ansible rolling deploy
         id: ansible-deploy
-        if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true')
+        if: steps.deploy-target.outcome == 'success'
         working-directory: ansible
         run: |
-          BRANCH_ARG="-e branch=${{ github.ref_name }}"
-          if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then
+          BRANCH_ARG="-e branch=${{ steps.deploy-target.outputs.branch }}"
+          if [[ "${{ steps.deploy-target.outputs.env }}" == "production" ]]; then
             for REGION_INI in production-usw.ini production-euw.ini production-euc.ini; do
               echo "=== Rolling region: $REGION_INI ==="
               ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i "$REGION_INI" \
diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml
index 441e82f..42d5296 100644
--- a/grafana/alert-rules/smart-git-proxy-alerts.yaml
+++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml
@@ -12,7 +12,8 @@ groups:
   - name: smart-git-proxy
     interval: 1m
     rules:
-      - alert: ProxyNodeDown
+      - uid: sgp-node-down
+        alert: ProxyNodeDown
         expr: up{job="smart-git-proxy"} == 0
         for: 2m
         labels:
@@ -21,7 +22,8 @@ groups:
           summary: "Smart git proxy node {{ $labels.instance }} is down"
           description: "Health endpoint unreachable for 2 minutes."
 
-      - alert: ErrorRateHigh
+      - uid: sgp-error-rate
+        alert: ErrorRateHigh
         expr: sum(rate(smart_git_proxy_errors_total[5m])) by (instance) > 0.5
         for: 5m
         labels:
@@ -30,7 +32,8 @@ groups:
           summary: "Elevated error rate on {{ $labels.instance }}"
           description: "Errors at {{ $value | humanize }}/sec. May indicate upstream GitHub issues or local git failures."
 
-      - alert: RequestLatencyHigh
+      - uid: sgp-latency-high
+        alert: RequestLatencyHigh
         expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket[5m])) by (le, instance)) > 30
         for: 5m
         labels:
@@ -39,7 +42,8 @@ groups:
           summary: "p95 request latency >30s on {{ $labels.instance }}"
           description: "Slow git request serving — may indicate NVMe I/O pressure or excessive concurrent requests."
 
-      - alert: SyncFailureRateHigh
+      - uid: sgp-sync-failures
+        alert: SyncFailureRateHigh
         expr: sum(rate(smart_git_proxy_sync_total{result="error"}[5m])) by (instance) > 0.5
         for: 5m
         labels:

From 5861eca7c1f03e18cd94606245775880a1440b72 Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 13:24:02 +0000
Subject: [PATCH 07/17] ansible: add hydrate playbook for raw machine
 bootstrapping

- Populate staging.ini and production-usw.ini with provisioned IPs
- Add hydrate.yaml: full bootstrapping (hostname, packages, Tailscale,
  NVMe format+mount, Grafana Alloy, service config, Vector/Axiom)
- Add task files: setup_hostname, install_dependencies, setup_tailscale,
  configure_tailscale_firewall, setup_nvme, setup_grafana_alloy,
  setup_service
- Add requirements.yml for artis3n.tailscale collection
- Refactor setup.yaml to reuse shared tasks (NVMe, Alloy, service, Axiom)
- Add Grafana Alloy for Prometheus metric scraping (scrapes /metrics,
  remote writes to self-hosted Prometheus)

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/README.md                             | 65 ++++++++++++---
 ansible/ansible.cfg                           |  3 +
 ansible/hydrate.yaml                          | 38 +++++++++
 ansible/production-usw.ini                    | 11 +--
 ansible/requirements.yml                      |  4 +
 ansible/secrets.yml                           |  2 +-
 ansible/setup.yaml                            | 65 +++------------
 ansible/staging.ini                           |  7 +-
 .../tasks/configure_tailscale_firewall.yaml   | 43 ++++++++++
 ansible/tasks/install_dependencies.yaml       | 24 ++++++
 ansible/tasks/setup_grafana_alloy.yaml        | 74 +++++++++++++++++
 ansible/tasks/setup_hostname.yaml             | 19 +++++
 ansible/tasks/setup_nvme.yaml                 | 82 +++++++++++++++++++
 ansible/tasks/setup_service.yaml              | 33 ++++++++
 ansible/tasks/setup_tailscale.yaml            | 23 ++++++
 15 files changed, 419 insertions(+), 74 deletions(-)
 create mode 100644 ansible/hydrate.yaml
 create mode 100644 ansible/requirements.yml
 create mode 100644 ansible/tasks/configure_tailscale_firewall.yaml
 create mode 100644 ansible/tasks/install_dependencies.yaml
 create mode 100644 ansible/tasks/setup_grafana_alloy.yaml
 create mode 100644 ansible/tasks/setup_hostname.yaml
 create mode 100644 ansible/tasks/setup_nvme.yaml
 create mode 100644 ansible/tasks/setup_service.yaml
 create mode 100644 ansible/tasks/setup_tailscale.yaml

diff --git a/ansible/README.md b/ansible/README.md
index d9c75a5..bc999f0 100644
--- a/ansible/README.md
+++ b/ansible/README.md
@@ -6,39 +6,70 @@ Playbooks and templates for deploying smart-git-proxy to dedicated proxy nodes.
 
 ```
 ansible/
-  setup.yaml               # One-time setup: systemd unit, NVMe mount, Vector logging
-  roll.yaml                # Rolling deploy: build or download binary, restart
+  hydrate.yaml               # Full bootstrapping: hostname, packages, Tailscale,
+                              # NVMe, Alloy, service config, Vector logging
+  setup.yaml                 # Incremental setup: NVMe, Alloy, service config, Vector
+  roll.yaml                  # Rolling deploy: build or download binary, restart
+  requirements.yml           # Ansible Galaxy dependencies (artis3n.tailscale)
   tasks/
-    setup_axiom.yaml        # Vector → Axiom log forwarding
+    setup_hostname.yaml       # Set hostname and /etc/hosts
+    install_dependencies.yaml # apt packages (git, make, jq, xfsprogs, etc.)
+    setup_tailscale.yaml      # Install and configure Tailscale with SSH
+    configure_tailscale_firewall.yaml  # UFW rules to block netscanning
+    setup_nvme.yaml           # Detect, format (XFS), and mount NVMe data drive
+    setup_grafana_alloy.yaml  # Prometheus scrape → remote write to Grafana
+    setup_service.yaml        # systemd unit and env config
+    setup_axiom.yaml          # Vector → Axiom log forwarding
   templates/
     smart-git-proxy.service.j2   # systemd unit
     smart-git-proxy-env.j2       # Environment config
     vector.yaml.j2               # Vector config for Axiom
-  production-usw.ini        # US-West inventory (3 nodes)
-  production-euw.ini        # EU-West inventory (2 nodes)
-  production-euc.ini        # EU-Central inventory (2 nodes)
-  staging.ini               # Staging inventory
-  secrets.yml               # ansible-vault encrypted secrets
+  production-usw.ini          # US-West inventory (3 nodes)
+  production-euw.ini          # EU-West inventory (placeholder)
+  production-euc.ini          # EU-Central inventory (placeholder)
+  staging.ini                 # Staging inventory (1 node)
+  secrets.yml                 # ansible-vault encrypted secrets
+```
+
+## Prerequisites
+
+```bash
+# Install Ansible Galaxy dependencies
+ansible-galaxy collection install -r requirements.yml
 ```
 
 ## Usage
 
-### Initial Setup (once per node)
+### Hydrate Raw Machines (once per node, first time)
+
+Bootstraps a freshly provisioned machine from scratch: sets hostname, installs
+packages, configures Tailscale + firewall, formats NVMe, sets up Alloy metrics,
+systemd service, and Vector log forwarding.
 
 ```bash
-ansible-playbook -i production-usw.ini setup.yaml --ask-vault-pass
+ansible-playbook -i staging.ini hydrate.yaml --vault-password-file ~/vault-password.txt
+ansible-playbook -i production-usw.ini hydrate.yaml --vault-password-file ~/vault-password.txt
+```
+
+### Incremental Setup (re-run safe)
+
+Updates service config, systemd unit, Alloy, and Vector without touching
+Tailscale or re-formatting NVMe. Safe to re-run.
+
+```bash
+ansible-playbook -i production-usw.ini setup.yaml --vault-password-file ~/vault-password.txt
 ```
 
 ### Deploy from Branch
 
 ```bash
-ansible-playbook -i production-usw.ini roll.yaml -e branch=main --ask-vault-pass
+ansible-playbook -i production-usw.ini roll.yaml -e branch=main --vault-password-file ~/vault-password.txt
 ```
 
 ### Deploy from Release
 
 ```bash
-ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --ask-vault-pass
+ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --vault-password-file ~/vault-password.txt
 ```
 
 ## Configuration
@@ -58,3 +89,13 @@ Key environment variables (set in `templates/smart-git-proxy-env.j2`):
 `secrets.yml` must contain (ansible-vault encrypted):
 - `github_token` — GitHub token for cloning the repo during branch builds
 - `axiom_token` — Axiom API token for log forwarding
+- `tailscale_auth_key` — Tailscale auth key (tag: `git-proxy`)
+
+## Inventory
+
+Hosts are listed by public IP for initial hydration (SSH over public internet).
+The build-and-deploy workflow joins the Tailscale network before running
+Ansible, so CI rolling deploys can reach nodes by Tailscale IP once listed.
+
+After hydration, note each node's Tailscale IP and replace the public IP in the
+inventory files to switch subsequent runs (including CI) to Tailscale-based SSH.
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
index 6d2dcd6..6dca555 100644
--- a/ansible/ansible.cfg
+++ b/ansible/ansible.cfg
@@ -1,3 +1,6 @@
 [defaults]
 host_key_checking = False
 timeout = 30
+
+[galaxy]
+collections_path = ~/.ansible/collections
diff --git a/ansible/hydrate.yaml b/ansible/hydrate.yaml
new file mode 100644
index 0000000..fd3c219
--- /dev/null
+++ b/ansible/hydrate.yaml
@@ -0,0 +1,38 @@
+---
+# Full hydration for raw machines: hostname, packages, Tailscale + firewall, NVMe, Alloy metrics, service config, logging.
+# Run once per node after provisioning.
+#
+# Usage:
+#   ansible-playbook -i staging.ini hydrate.yaml --vault-password-file ~/vault-password.txt
+#   ansible-playbook -i production-usw.ini hydrate.yaml --vault-password-file ~/vault-password.txt
+
+- name: Hydrate Smart Git Proxy Nodes
+  hosts: all
+  become: yes
+  serial: 4
+  vars_files:
+    - secrets.yml
+  tasks:
+    - name: Set hostname
+      include_tasks: tasks/setup_hostname.yaml
+
+    - name: Install base packages
+      include_tasks: tasks/install_dependencies.yaml
+
+    - name: Set up Tailscale
+      include_tasks: tasks/setup_tailscale.yaml
+
+    - name: Configure Tailscale firewall rules
+      include_tasks: tasks/configure_tailscale_firewall.yaml
+
+    - name: Set up NVMe storage
+      include_tasks: tasks/setup_nvme.yaml
+
+    - name: Set up Grafana Alloy for metrics
+      include_tasks: tasks/setup_grafana_alloy.yaml
+
+    - name: Set up service config and systemd
+      include_tasks: tasks/setup_service.yaml
+
+    - name: Set up Axiom log forwarding
+      include_tasks: tasks/setup_axiom.yaml
diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini
index c6bf394..45462f5 100644
--- a/ansible/production-usw.ini
+++ b/ansible/production-usw.ini
@@ -1,10 +1,11 @@
-# US-West proxy nodes (3 nodes)
-# Update with actual IPs after provisioning.
+# US-West proxy nodes (3 nodes, s4.s2.large, PhoenixNAP, 36mo reservation)
+# Provisioned 2026-06-17
 [all:vars]
+ansible_user=ubuntu
 region=us-west
 env=production
 
 [proxy]
-# proxy-usw-1 ansible_host=<IP>
-# proxy-usw-2 ansible_host=<IP>
-# proxy-usw-3 ansible_host=<IP>
+192.240.240.207 hostname=git-proxy-usw-1 server_id=6a329c5bcd195a90018570da
+192.240.240.208 hostname=git-proxy-usw-2 server_id=6a329c5dcd195a90018570db
+192.240.240.209 hostname=git-proxy-usw-3 server_id=6a329c5ecd195a90018570dc
diff --git a/ansible/requirements.yml b/ansible/requirements.yml
new file mode 100644
index 0000000..9141d26
--- /dev/null
+++ b/ansible/requirements.yml
@@ -0,0 +1,4 @@
+---
+collections:
+  - name: artis3n.tailscale
+    version: ">=4.0.0"
diff --git a/ansible/secrets.yml b/ansible/secrets.yml
index e1ed311..f0c5c9c 100644
--- a/ansible/secrets.yml
+++ b/ansible/secrets.yml
@@ -1,3 +1,3 @@
 # Encrypted with ansible-vault.
-# Contains: github_token, axiom_token
+# Contains: github_token, axiom_token, tailscale_auth_key
 # To edit: ansible-vault edit secrets.yml
diff --git a/ansible/setup.yaml b/ansible/setup.yaml
index e86e001..ce3d4d5 100644
--- a/ansible/setup.yaml
+++ b/ansible/setup.yaml
@@ -1,6 +1,9 @@
 ---
-# One-time setup: deploy systemd unit, env config, NVMe mount, Vector logging.
-# Does not start the service -- the roll playbook handles that.
+# Incremental setup: NVMe storage, Grafana Alloy metrics, service config + systemd unit, Vector logging.
+# For raw machine bootstrapping (including Tailscale, NVMe format, packages), use hydrate.yaml.
+#
+# Usage:
+#   ansible-playbook -i production-usw.ini setup.yaml --vault-password-file ~/vault-password.txt
 
 - name: Setup Smart Git Proxy
   hosts: all
@@ -8,58 +11,14 @@
   vars_files:
     - secrets.yml
   tasks:
-    - name: Determine environment from inventory file name
-      set_fact:
-        ENV: "{{ 'production' if 'production' in inventory_file else 'staging' if 'staging' in inventory_file else '' }}"
+    - name: Set up NVMe storage
+      include_tasks: tasks/setup_nvme.yaml
 
-    - name: Ensure ENV is determined
-      fail:
-        msg: "Failed to determine environment from inventory file name."
-      when: ENV == ''
+    - name: Set up Grafana Alloy for metrics
+      include_tasks: tasks/setup_grafana_alloy.yaml
 
-    # --- NVMe storage ---
-    - name: Verify NVMe mount exists
-      stat:
-        path: /mnt/nvme
-      register: nvme_mount
+    - name: Set up service config and systemd
+      include_tasks: tasks/setup_service.yaml
 
-    - name: Fail if NVMe mount is missing
-      fail:
-        msg: "/mnt/nvme does not exist. Proxy nodes require NVMe storage for git mirrors."
-      when: not nvme_mount.stat.exists
-
-    - name: Create mirror directory
-      file:
-        path: "{{ mirror_dir | default('/mnt/nvme/mirrors') }}"
-        state: directory
-        mode: "0755"
-
-    # --- Config ---
-    - name: Ensure config directory exists
-      file:
-        path: /etc/smart-git-proxy
-        state: directory
-        mode: "0755"
-
-    - name: Deploy environment config
-      template:
-        src: templates/smart-git-proxy-env.j2
-        dest: /etc/smart-git-proxy/env
-        mode: "0600"
-
-    # --- systemd ---
-    - name: Deploy systemd unit
-      template:
-        src: templates/smart-git-proxy.service.j2
-        dest: /etc/systemd/system/smart-git-proxy.service
-        mode: "0644"
-      register: systemd_unit
-
-    - name: Reload systemd daemon
-      systemd:
-        daemon_reload: yes
-      when: systemd_unit.changed
-
-    # --- Axiom log forwarding ---
-    - name: Setup Axiom log forwarding
+    - name: Set up Axiom log forwarding
       include_tasks: tasks/setup_axiom.yaml
diff --git a/ansible/staging.ini b/ansible/staging.ini
index b5eb016..7258823 100644
--- a/ansible/staging.ini
+++ b/ansible/staging.ini
@@ -1,8 +1,9 @@
-# Staging proxy nodes (1 node per region or shared)
-# Update with actual IPs after provisioning.
+# Staging proxy node (1 node, s4.s2.large, PhoenixNAP, 36mo reservation)
+# Provisioned 2026-06-17
 [all:vars]
+ansible_user=ubuntu
 region=us-west
 env=staging
 
 [proxy]
-# proxy-staging-1 ansible_host=<IP>
+192.240.240.210 hostname=git-proxy-staging-1 server_id=6a329cb6cd195a90018570de
diff --git a/ansible/tasks/configure_tailscale_firewall.yaml b/ansible/tasks/configure_tailscale_firewall.yaml
new file mode 100644
index 0000000..1b41c03
--- /dev/null
+++ b/ansible/tasks/configure_tailscale_firewall.yaml
@@ -0,0 +1,43 @@
+---
+- name: Ensure UFW is installed
+  apt:
+    name: ufw
+    state: present
+
+- name: Enable UFW with default allow
+  ufw:
+    state: enabled
+    policy: allow
+    direction: incoming
+    logging: "on"
+
+- name: Allow Tailscale traffic within Tailscale network
+  ufw:
+    rule: allow
+    direction: out
+    to_ip: 100.64.0.0/10
+    port: "41641"
+    proto: udp
+
+- name: Block Tailscale UDP scanning to private ranges
+  ufw:
+    rule: deny
+    direction: out
+    to_ip: "{{ item }}"
+    port: "41641"
+    proto: udp
+  loop:
+    - 10.0.0.0/8
+    - 172.16.0.0/12
+    - 192.168.0.0/16
+
+- name: Block all other Tailscale outbound scanning
+  ufw:
+    rule: deny
+    direction: out
+    port: "41641"
+    proto: udp
+
+- name: Reload UFW
+  ufw:
+    state: reloaded
diff --git a/ansible/tasks/install_dependencies.yaml b/ansible/tasks/install_dependencies.yaml
new file mode 100644
index 0000000..6f84aa5
--- /dev/null
+++ b/ansible/tasks/install_dependencies.yaml
@@ -0,0 +1,24 @@
+---
+- name: Wait for dpkg lock
+  shell: while lsof /var/lib/dpkg/lock-frontend 2>/dev/null; do sleep 10; done;
+
+- name: Update apt cache
+  apt:
+    update_cache: yes
+  retries: 5
+  delay: 10
+  register: apt_update
+  until: apt_update is success
+
+- name: Install required packages
+  apt:
+    name:
+      - curl
+      - wget
+      - git
+      - make
+      - jq
+      - xfsprogs
+      - nvme-cli
+      - ufw
+    state: present
diff --git a/ansible/tasks/setup_grafana_alloy.yaml b/ansible/tasks/setup_grafana_alloy.yaml
new file mode 100644
index 0000000..6965037
--- /dev/null
+++ b/ansible/tasks/setup_grafana_alloy.yaml
@@ -0,0 +1,84 @@
+---
+# Installs Grafana Alloy and points it at the local smart-git-proxy
+# /metrics endpoint, remote-writing the samples to Prometheus.
+
+- name: Set Grafana endpoints
+  set_fact:
+    self_hosted_prometheus_url: "http://grafana.internal.blacksmith.sh:9090/api/v1/write"
+
+# `set -e` makes a failed download (or a failed dependency repair) fail the
+# task; without it the script's exit status is that of the final
+# `systemctl daemon-reload`, so a broken install would be reported as success.
+# `creates` skips the whole script once /usr/bin/alloy exists.
+- name: Install Grafana Alloy
+  shell: |
+    set -e
+    ARCH=$(dpkg --print-architecture)
+    wget -q -O /tmp/alloy.deb "https://github.com/grafana/alloy/releases/download/v1.5.1/alloy-1.5.1-1.${ARCH}.deb"
+    DEBIAN_FRONTEND=noninteractive dpkg -i --force-confnew /tmp/alloy.deb || apt-get install -f -y
+    rm -f /tmp/alloy.deb
+    systemctl daemon-reload
+  args:
+    creates: /usr/bin/alloy
+
+- name: Create Alloy configuration directory
+  file:
+    path: /etc/alloy
+    state: directory
+    mode: "0755"
+
+- name: Configure Grafana Alloy
+  copy:
+    content: |
+      // Prometheus scrape of local smart-git-proxy /metrics endpoint
+      prometheus.scrape "smart_git_proxy" {
+          targets = [{
+              __address__ = "127.0.0.1:8080",
+          }]
+          metrics_path = "/metrics"
+          scrape_interval = "15s"
+          forward_to = [prometheus.relabel.add_labels.receiver]
+      }
+
+      // Add environment/region labels
+      prometheus.relabel "add_labels" {
+          forward_to = [prometheus.remote_write.self_hosted.receiver]
+
+          rule {
+              target_label = "environment"
+              replacement   = "{{ env }}"
+          }
+          rule {
+              target_label = "region"
+              replacement   = "{{ region }}"
+          }
+          rule {
+              target_label = "job"
+              replacement   = "smart-git-proxy"
+          }
+      }
+
+      // Remote write to self-hosted Prometheus
+      prometheus.remote_write "self_hosted" {
+          endpoint {
+              url = "{{ self_hosted_prometheus_url }}"
+          }
+      }
+    dest: /etc/alloy/config.alloy
+    mode: "0644"
+  register: alloy_config
+
+# Restart only when the rendered config file changed in this run.
+- name: Enable and restart Alloy
+  systemd:
+    name: alloy
+    state: restarted
+    enabled: true
+  when: alloy_config.changed
+
+# Covers runs where the config was unchanged and no restart was issued.
+- name: Ensure Alloy is running
+  systemd:
+    name: alloy
+    state: started
+    enabled: true
diff --git a/ansible/tasks/setup_hostname.yaml b/ansible/tasks/setup_hostname.yaml
new file mode 100644
index 0000000..82d528e
--- /dev/null
+++ b/ansible/tasks/setup_hostname.yaml
@@ -0,0 +1,19 @@
+---
+- name: Set hostname
+  hostname:
+    name: "{{ hostname }}"
+
+- name: Add hostname to /etc/hosts
+  lineinfile:
+    path: /etc/hosts
+    line: "{{ ansible_default_ipv4.address }} {{ hostname }}"
+    regexp: "^{{ ansible_default_ipv4.address }}"
+    state: present
+
+- name: Prevent cloud-init from overwriting hostname
+  copy:
+    content: |
+      #cloud-config
+      preserve_hostname: true
+    dest: /etc/cloud/cloud.cfg.d/99-preserve-hostname.cfg
+    mode: "0644"
diff --git a/ansible/tasks/setup_nvme.yaml b/ansible/tasks/setup_nvme.yaml
new file mode 100644
index 0000000..c9e9e87
--- /dev/null
+++ b/ansible/tasks/setup_nvme.yaml
@@ -0,0 +1,99 @@
+---
+# s4.s2.large machines have 1x 1TB OS + 2x 8TB NVMe data drives.
+# We format and mount one data drive for git mirrors.
+
+# -b reports SIZE in bytes so the size filter below does not depend on the
+# unit suffix of lsblk's human-readable output.
+- name: Get block device info
+  command: lsblk -J -b -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE
+  register: lsblk_output
+  changed_when: false
+
+- name: Parse block devices
+  set_fact:
+    block_devices: "{{ (lsblk_output.stdout | from_json).blockdevices }}"
+
+# Builds the candidate list from whole disks that are larger than 2 TiB and
+# carry no /, /boot or /boot/efi mount on a child or grandchild.
+# A disk already mounted at /mnt/nvme is listed first, so re-running the
+# playbook keeps using the same drive instead of picking the other one.
+- name: Find data NVMe drives (>2TB, no OS partitions)
+  set_fact:
+    data_drives: >-
+      {%- set min_bytes = 2 * 1024 ** 4 -%}
+      {%- set mounted = [] -%}
+      {%- set drives = [] -%}
+      {%- for device in block_devices -%}
+        {%- if device.type == 'disk' -%}
+          {%- set has_os = [] -%}
+          {%- if device.children is defined -%}
+            {%- for child in device.children -%}
+              {%- if child.mountpoint in ['/', '/boot', '/boot/efi'] -%}
+                {%- if has_os.append(true) -%}{%- endif -%}
+              {%- endif -%}
+              {%- if child.children is defined -%}
+                {%- for gc in child.children -%}
+                  {%- if gc.mountpoint in ['/', '/boot', '/boot/efi'] -%}
+                    {%- if has_os.append(true) -%}{%- endif -%}
+                  {%- endif -%}
+                {%- endfor -%}
+              {%- endif -%}
+            {%- endfor -%}
+          {%- endif -%}
+          {%- set size_bytes = device.size | int -%}
+          {%- if has_os | length == 0 and size_bytes > min_bytes -%}
+            {%- if device.mountpoint == '/mnt/nvme' -%}
+              {%- if mounted.append(device.name) -%}{%- endif -%}
+            {%- elif device.mountpoint is none -%}
+              {%- if drives.append(device.name) -%}{%- endif -%}
+            {%- endif -%}
+          {%- endif -%}
+        {%- endif -%}
+      {%- endfor -%}
+      {{ mounted + drives }}
+
+- name: Display detected data drives
+  debug:
+    msg: "Data NVMe drives: {{ data_drives }}"
+
+- name: Fail if no data drives found
+  fail:
+    msg: "No NVMe data drives >2TB found. Expected s4.s2.large with 2x 8TB NVMe."
+  when: data_drives | length == 0
+
+- name: Check if /mnt/nvme is already mounted
+  command: mountpoint -q /mnt/nvme
+  register: nvme_mounted
+  failed_when: false
+  changed_when: false
+
+# Only format when nothing is mounted at /mnt/nvme yet; force is off, so an
+# existing filesystem on the device is left alone.
+- name: Format first data drive as XFS
+  filesystem:
+    fstype: xfs
+    dev: "/dev/{{ data_drives[0] }}"
+    force: false
+  when: nvme_mounted.rc != 0
+
+- name: Create /mnt/nvme mount point
+  file:
+    path: /mnt/nvme
+    state: directory
+    mode: "0755"
+
+# NOTE(review): the fstab entry uses the kernel device name; confirm NVMe
+# naming is stable across reboots on this hardware, or switch to UUID=.
+- name: Mount NVMe drive
+  mount:
+    path: /mnt/nvme
+    src: "/dev/{{ data_drives[0] }}"
+    fstype: xfs
+    opts: defaults,noatime
+    state: mounted
+
+- name: Create mirrors directory
+  file:
+    path: /mnt/nvme/mirrors
+    state: directory
+    mode: "0755"
diff --git a/ansible/tasks/setup_service.yaml b/ansible/tasks/setup_service.yaml
new file mode 100644
index 0000000..5e630d5
--- /dev/null
+++ b/ansible/tasks/setup_service.yaml
@@ -0,0 +1,39 @@
+---
+# Renders the smart-git-proxy env file and systemd unit, then enables the
+# unit. Nothing in this file starts or restarts the service.
+
+# Exposes the inventory `env` value under the upper-case fact name ENV.
+- name: Determine environment from inventory
+  set_fact:
+    ENV: "{{ env }}"
+
+- name: Ensure config directory exists
+  file:
+    state: directory
+    path: /etc/smart-git-proxy
+    mode: "0755"
+
+# Mode 0600: owner read/write only.
+- name: Deploy environment config
+  template:
+    dest: /etc/smart-git-proxy/env
+    src: templates/smart-git-proxy-env.j2
+    mode: "0600"
+
+- name: Deploy systemd unit
+  template:
+    dest: /etc/systemd/system/smart-git-proxy.service
+    src: templates/smart-git-proxy.service.j2
+    mode: "0644"
+  register: systemd_unit
+
+# systemd is told to re-read unit files only when the unit file changed.
+- name: Reload systemd daemon
+  systemd:
+    daemon_reload: true
+  when: systemd_unit.changed
+
+- name: Enable smart-git-proxy service
+  systemd:
+    enabled: true
+    name: smart-git-proxy
diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml
new file mode 100644
index 0000000..0a5a901
--- /dev/null
+++ b/ansible/tasks/setup_tailscale.yaml
@@ -0,0 +1,23 @@
+---
+- name: Include Tailscale role
+  ansible.builtin.include_role:
+    name: artis3n.tailscale
+  vars:
+    tailscale_authkey: "{{ tailscale_auth_key }}"
+    tailscale_args: "--ssh --hostname {{ hostname }}"
+    tailscale_tags: "{{ ['git-proxy'] }}"
+
+- name: Ensure Tailscale is running
+  ansible.builtin.systemd:
+    name: tailscaled
+    state: started
+    enabled: yes
+
+- name: Get Tailscale IP
+  ansible.builtin.command: tailscale ip
+  register: tailscale_ip
+  changed_when: false
+
+- name: Display Tailscale IP
+  ansible.builtin.debug:
+    msg: "Tailscale IP for {{ hostname }}: {{ tailscale_ip.stdout }}"

From b862c45c3441c2bb58a19ae7df920afe900711fa Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 13:28:06 +0000
Subject: [PATCH 08/17] ansible: fix inventory to use Tailscale hostnames as
 keys

- Use Tailscale hostnames (git-proxy-usw-1, etc.) as inventory host keys
  so tailscale ping and Ansible connect via MagicDNS from CI
- Keep ansible_host=<public_ip> for initial hydration (SSH over public IP)
- Fix checkout to use branch input on workflow_dispatch
- Remove hostname inventory var (inventory_hostname is the hostname now)

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 .github/workflows/build-and-deploy.yaml |  2 ++
 ansible/production-usw.ini              | 10 +++++++---
 ansible/staging.ini                     |  6 +++++-
 ansible/tasks/setup_hostname.yaml       |  4 ++--
 ansible/tasks/setup_tailscale.yaml      |  4 ++--
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml
index aaf72f1..bde9dd2 100644
--- a/.github/workflows/build-and-deploy.yaml
+++ b/.github/workflows/build-and-deploy.yaml
@@ -33,6 +33,8 @@ jobs:
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.ref }}
 
       - name: Set up Go
         uses: actions/setup-go@v5
diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini
index 45462f5..f17ada7 100644
--- a/ansible/production-usw.ini
+++ b/ansible/production-usw.ini
@@ -1,11 +1,15 @@
 # US-West proxy nodes (3 nodes, s4.s2.large, PhoenixNAP, 36mo reservation)
 # Provisioned 2026-06-17
+#
+# Host keys are Tailscale hostnames (resolved via MagicDNS from CI).
+# ansible_host is the public IP (used for initial hydration before Tailscale).
+# After hydration, remove ansible_host lines to connect via Tailscale.
 [all:vars]
 ansible_user=ubuntu
 region=us-west
 env=production
 
 [proxy]
-192.240.240.207 hostname=git-proxy-usw-1 server_id=6a329c5bcd195a90018570da
-192.240.240.208 hostname=git-proxy-usw-2 server_id=6a329c5dcd195a90018570db
-192.240.240.209 hostname=git-proxy-usw-3 server_id=6a329c5ecd195a90018570dc
+git-proxy-usw-1 ansible_host=192.240.240.207 server_id=6a329c5bcd195a90018570da
+git-proxy-usw-2 ansible_host=192.240.240.208 server_id=6a329c5dcd195a90018570db
+git-proxy-usw-3 ansible_host=192.240.240.209 server_id=6a329c5ecd195a90018570dc
diff --git a/ansible/staging.ini b/ansible/staging.ini
index 7258823..a0fd252 100644
--- a/ansible/staging.ini
+++ b/ansible/staging.ini
@@ -1,9 +1,13 @@
 # Staging proxy node (1 node, s4.s2.large, PhoenixNAP, 36mo reservation)
 # Provisioned 2026-06-17
+#
+# Host keys are Tailscale hostnames (resolved via MagicDNS from CI).
+# ansible_host is the public IP (used for initial hydration before Tailscale).
+# After hydration, remove ansible_host lines to connect via Tailscale.
 [all:vars]
 ansible_user=ubuntu
 region=us-west
 env=staging
 
 [proxy]
-192.240.240.210 hostname=git-proxy-staging-1 server_id=6a329cb6cd195a90018570de
+git-proxy-staging-1 ansible_host=192.240.240.210 server_id=6a329cb6cd195a90018570de
diff --git a/ansible/tasks/setup_hostname.yaml b/ansible/tasks/setup_hostname.yaml
index 82d528e..4679323 100644
--- a/ansible/tasks/setup_hostname.yaml
+++ b/ansible/tasks/setup_hostname.yaml
@@ -1,12 +1,13 @@
 ---
 - name: Set hostname
   hostname:
-    name: "{{ hostname }}"
+    name: "{{ inventory_hostname }}"
 
 - name: Add hostname to /etc/hosts
   lineinfile:
     path: /etc/hosts
-    line: "{{ ansible_default_ipv4.address }} {{ hostname }}"
-    regexp: "^{{ ansible_default_ipv4.address }}"
+    line: "{{ ansible_default_ipv4.address }} {{ inventory_hostname }}"
+    # Escape the dots and require trailing whitespace so a longer IP sharing this prefix is not replaced.
+    regexp: "^{{ ansible_default_ipv4.address | regex_escape }}\\s"
     state: present
 
diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml
index 0a5a901..11ca6d2 100644
--- a/ansible/tasks/setup_tailscale.yaml
+++ b/ansible/tasks/setup_tailscale.yaml
@@ -4,7 +4,7 @@
     name: artis3n.tailscale
   vars:
     tailscale_authkey: "{{ tailscale_auth_key }}"
-    tailscale_args: "--ssh --hostname {{ hostname }}"
+    tailscale_args: "--ssh --hostname {{ inventory_hostname }}"
     tailscale_tags: "{{ ['git-proxy'] }}"
 
 - name: Ensure Tailscale is running
@@ -20,4 +20,4 @@
 
 - name: Display Tailscale IP
   ansible.builtin.debug:
-    msg: "Tailscale IP for {{ hostname }}: {{ tailscale_ip.stdout }}"
+    msg: "Tailscale IP for {{ inventory_hostname }}: {{ tailscale_ip.stdout }}"

From 8d74545e645038da11818b8d546a7ffb9ae5b31b Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 13:31:51 +0000
Subject: [PATCH 09/17] ansible: fix rclone config indentation and add instance
 label to Alloy

- Replace heredoc with printf to avoid YAML indentation leaking into rclone.conf
- Add instance label (inventory_hostname) to Alloy relabel rules so per-node
  dashboards and by(instance) alerts can distinguish hosts

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 .github/workflows/build-and-deploy.yaml | 10 ++--------
 ansible/tasks/setup_grafana_alloy.yaml  |  4 ++++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml
index bde9dd2..f012157 100644
--- a/.github/workflows/build-and-deploy.yaml
+++ b/.github/workflows/build-and-deploy.yaml
@@ -62,14 +62,8 @@ jobs:
           R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
         run: |
           mkdir -p ~/.config/rclone
-          cat > ~/.config/rclone/rclone.conf <<EOF
-          [r2]
-          type = s3
-          provider = Cloudflare
-          access_key_id = $R2_ACCESS_KEY_ID
-          secret_access_key = $R2_SECRET_ACCESS_KEY
-          endpoint = https://1ede90a8395416f286ba9f692dc6bacf.r2.cloudflarestorage.com
-          EOF
+          printf '[r2]\ntype = s3\nprovider = Cloudflare\naccess_key_id = %s\nsecret_access_key = %s\nendpoint = https://1ede90a8395416f286ba9f692dc6bacf.r2.cloudflarestorage.com\n' \
+            "$R2_ACCESS_KEY_ID" "$R2_SECRET_ACCESS_KEY" > ~/.config/rclone/rclone.conf
 
       - name: Push binary to R2
         if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch'
diff --git a/ansible/tasks/setup_grafana_alloy.yaml b/ansible/tasks/setup_grafana_alloy.yaml
index 6965037..bb44c89 100644
--- a/ansible/tasks/setup_grafana_alloy.yaml
+++ b/ansible/tasks/setup_grafana_alloy.yaml
@@ -48,6 +48,10 @@
               target_label = "job"
               replacement   = "smart-git-proxy"
           }
+          rule {
+              target_label = "instance"
+              replacement   = "{{ inventory_hostname }}"
+          }
       }
 
       // Remote write to self-hosted Prometheus

From 1efbbfd421cf25212d60596134ed0d71767e7fc2 Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 13:33:46 +0000
Subject: [PATCH 10/17] ansible: widen Slack failure notification to any step
 failure

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 .github/workflows/build-and-deploy.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml
index f012157..a02d136 100644
--- a/.github/workflows/build-and-deploy.yaml
+++ b/.github/workflows/build-and-deploy.yaml
@@ -173,7 +173,7 @@ jobs:
           ANSIBLE_HOST_KEY_CHECKING: "False"
 
       - name: Send Slack notification on failure
-        if: failure() && steps.ansible-deploy.outcome == 'failure'
+        if: failure()
         uses: slackapi/slack-github-action@v1
         with:
           payload: |

From 2f7fa822d47767b21c3ae6dadfc0625501b697ff Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 14:08:41 +0000
Subject: [PATCH 11/17] ansible: fix requirements.yml - artis3n.tailscale is a
 role not a collection

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/requirements.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/requirements.yml b/ansible/requirements.yml
index 9141d26..2495fbd 100644
--- a/ansible/requirements.yml
+++ b/ansible/requirements.yml
@@ -1,4 +1,4 @@
 ---
-collections:
+roles:
   - name: artis3n.tailscale
-    version: ">=4.0.0"
+    version: "4.5.0"

From d5f614e1746f9a6def02ec1faf43bc49b411380b Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 14:10:46 +0000
Subject: [PATCH 12/17] ansible: fix tailscale dependency - use collection
 format matching fa repo

- Remove version pin (let Galaxy resolve latest)
- Use artis3n.tailscale.machine (collection role) matching fa/infra/ansible pattern

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/requirements.yml           | 3 +--
 ansible/tasks/setup_tailscale.yaml | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/ansible/requirements.yml b/ansible/requirements.yml
index 2495fbd..a0c53c6 100644
--- a/ansible/requirements.yml
+++ b/ansible/requirements.yml
@@ -1,4 +1,3 @@
 ---
-roles:
+collections:
   - name: artis3n.tailscale
-    version: "4.5.0"
diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml
index 11ca6d2..f885661 100644
--- a/ansible/tasks/setup_tailscale.yaml
+++ b/ansible/tasks/setup_tailscale.yaml
@@ -1,7 +1,7 @@
 ---
 - name: Include Tailscale role
   ansible.builtin.include_role:
-    name: artis3n.tailscale
+    name: artis3n.tailscale.machine
   vars:
     tailscale_authkey: "{{ tailscale_auth_key }}"
     tailscale_args: "--ssh --hostname {{ inventory_hostname }}"

From 66e8587ae29ab34e4eed6faf078d72617b7542e5 Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 15:31:36 +0000
Subject: [PATCH 13/17] ansible: fix vector config - use include_units instead
 of units

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/templates/vector.yaml.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2
index 5b06e65..de5743b 100644
--- a/ansible/templates/vector.yaml.j2
+++ b/ansible/templates/vector.yaml.j2
@@ -2,7 +2,7 @@
 sources:
   smart_git_proxy_logs:
     type: journald
-    units:
+    include_units:
       - smart-git-proxy
 
 transforms:

From 734002adad3556ad6e2d3badec27173ee8bd9aa7 Mon Sep 17 00:00:00 2001
From: Piotr Bejda <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 11:31:43 -0400
Subject: [PATCH 14/17] ansible: secrets

---
 ansible/secrets.yml | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/ansible/secrets.yml b/ansible/secrets.yml
index f0c5c9c..3dd4a09 100644
--- a/ansible/secrets.yml
+++ b/ansible/secrets.yml
@@ -1,3 +1,12 @@
-# Encrypted with ansible-vault.
-# Contains: github_token, axiom_token, tailscale_auth_key
-# To edit: ansible-vault edit secrets.yml
+$ANSIBLE_VAULT;1.1;AES256
+35386437653735306338376264643236303766323334656130363862353332333838666262316262
+6437376137356362636362316666386436313062376338640a303862636133323861633664626335
+63643534613963636137663533396433343833376266313031386134663166633065343438616537
+3166663338663162340a386163353433663036653236613438366631303431323463633633626265
+64656438313266303662333539313432386163373864323136663137336665383938343932666233
+64346165643261616265343861633163363462366235343639343165366438663364306638396335
+33636238386338366331336662383137376262303061326234323434383835373564346266626532
+34356632303836396662323063323733373031373735373366386235666365346537373837306261
+36323065333933623963633835306535313835353430623365373038646665333666633037343830
+36353135663563346134336361626133663238386532393863316336323932303661666636323161
+663866333462633535396465346166336132

From d33b5a034e99da3ff575335d8fc237bff1896bea Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 15:45:27 +0000
Subject: [PATCH 15/17] ansible: fix VRL merge calls to use infallible merge!()

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/templates/vector.yaml.j2 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2
index de5743b..e2c698f 100644
--- a/ansible/templates/vector.yaml.j2
+++ b/ansible/templates/vector.yaml.j2
@@ -14,11 +14,11 @@ transforms:
       if exists(.message) {
         parsed, err = parse_json(.message)
         if err == null {
-          . = merge(., parsed)
+          . = merge!(., parsed)
         }
       }
 
-      . = merge(., {
+      . = merge!(., {
         "host": "${HOSTNAME}",
         "environment": "{{ env }}",
         "region": "{{ region }}",

From fce24a5b03e4563d5d926de909f080c53b0f8795 Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 15:57:36 +0000
Subject: [PATCH 16/17] ansible: always restart Vector and reset failed state
 before start

Prevents stale systemd failed state from blocking subsequent runs.

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/tasks/setup_axiom.yaml | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml
index cf3a2f9..ec0be73 100644
--- a/ansible/tasks/setup_axiom.yaml
+++ b/ansible/tasks/setup_axiom.yaml
@@ -43,15 +43,13 @@
     mode: "0600"
   register: vector_config
 
+- name: Reset Vector failed state
+  ansible.builtin.command: systemctl reset-failed vector
+  changed_when: false
+  failed_when: false
+
 - name: Enable and restart Vector
   systemd:
     name: vector
     state: restarted
     enabled: yes
-  when: vector_config.changed or vector_systemd_override.changed
-
-- name: Ensure Vector is running
-  systemd:
-    name: vector
-    state: started
-    enabled: yes

From add8e1595aa82ead0d56df35f78c8d9bff530d7a Mon Sep 17 00:00:00 2001
From: piotr <piotr@blacksmith.sh>
Date: Wed, 17 Jun 2026 16:10:18 +0000
Subject: [PATCH 17/17] ansible: fix vector config ownership - must be readable
 by vector user

Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 ansible/tasks/setup_axiom.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml
index ec0be73..e27c856 100644
--- a/ansible/tasks/setup_axiom.yaml
+++ b/ansible/tasks/setup_axiom.yaml
@@ -40,6 +40,8 @@
   template:
     src: templates/vector.yaml.j2
     dest: /etc/vector/vector.yaml
+    owner: vector
+    group: vector
     mode: "0600"
   register: vector_config