From 418ac8c67caffda718aba841019effabbc07b71d Mon Sep 17 00:00:00 2001 From: piotr Date: Tue, 16 Jun 2026 22:01:05 +0000 Subject: [PATCH 01/17] add deployment and observability scaffold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ansible/: systemd unit, env config, rolling deploy playbook, Vector→Axiom log forwarding, per-region inventory files (3 USW, 2 EUW, 2 EUC, staging) grafana/: dashboard with clone rate, cache hit rate, latency, upstream fetches, NVMe usage, per-repo traffic, errors. Alert rules for node health, NVMe usage, upstream errors, clone latency, eviction rate. Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/README.md | 60 ++++ ansible/ansible.cfg | 3 + ansible/production-euc.ini | 9 + ansible/production-euw.ini | 9 + ansible/production-usw.ini | 10 + ansible/roll.yaml | 95 ++++++ ansible/secrets.yml | 3 + ansible/setup.yaml | 61 ++++ ansible/staging.ini | 8 + ansible/tasks/setup_axiom.yaml | 56 ++++ ansible/templates/smart-git-proxy-env.j2 | 10 + ansible/templates/smart-git-proxy.service.j2 | 23 ++ ansible/templates/vector.yaml.j2 | 37 +++ .../alert-rules/smart-git-proxy-alerts.yaml | 60 ++++ grafana/dashboards/smart-git-proxy.json | 292 ++++++++++++++++++ 15 files changed, 736 insertions(+) create mode 100644 ansible/README.md create mode 100644 ansible/ansible.cfg create mode 100644 ansible/production-euc.ini create mode 100644 ansible/production-euw.ini create mode 100644 ansible/production-usw.ini create mode 100644 ansible/roll.yaml create mode 100644 ansible/secrets.yml create mode 100644 ansible/setup.yaml create mode 100644 ansible/staging.ini create mode 100644 ansible/tasks/setup_axiom.yaml create mode 100644 ansible/templates/smart-git-proxy-env.j2 create mode 100644 ansible/templates/smart-git-proxy.service.j2 create mode 100644 ansible/templates/vector.yaml.j2 create mode 100644 grafana/alert-rules/smart-git-proxy-alerts.yaml create mode 
100644 grafana/dashboards/smart-git-proxy.json diff --git a/ansible/README.md b/ansible/README.md new file mode 100644 index 0000000..d9c75a5 --- /dev/null +++ b/ansible/README.md @@ -0,0 +1,60 @@ +# Ansible Deployment + +Playbooks and templates for deploying smart-git-proxy to dedicated proxy nodes. + +## Directory Structure + +``` +ansible/ + setup.yaml # One-time setup: systemd unit, NVMe mount, Vector logging + roll.yaml # Rolling deploy: build or download binary, restart + tasks/ + setup_axiom.yaml # Vector → Axiom log forwarding + templates/ + smart-git-proxy.service.j2 # systemd unit + smart-git-proxy-env.j2 # Environment config + vector.yaml.j2 # Vector config for Axiom + production-usw.ini # US-West inventory (3 nodes) + production-euw.ini # EU-West inventory (2 nodes) + production-euc.ini # EU-Central inventory (2 nodes) + staging.ini # Staging inventory + secrets.yml # ansible-vault encrypted secrets +``` + +## Usage + +### Initial Setup (once per node) + +```bash +ansible-playbook -i production-usw.ini setup.yaml --ask-vault-pass +``` + +### Deploy from Branch + +```bash +ansible-playbook -i production-usw.ini roll.yaml -e branch=main --ask-vault-pass +``` + +### Deploy from Release + +```bash +ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --ask-vault-pass +``` + +## Configuration + +Key environment variables (set in `templates/smart-git-proxy-env.j2`): + +| Variable | Default | Description | +|----------|---------|-------------| +| `LISTEN_ADDR` | `:8080` | HTTP listen address | +| `MIRROR_DIR` | `/mnt/nvme/mirrors` | Path for bare git mirrors | +| `MIRROR_MAX_SIZE` | `80%` | LRU eviction threshold | +| `SYNC_STALE_AFTER` | `2s` | Upstream sync staleness window | +| `AUTH_MODE` | `pass-through` | Forward client's GitHub token upstream | + +## Secrets + +`secrets.yml` must contain (ansible-vault encrypted): +- `github_token` — GitHub token for cloning the repo during branch builds +- `axiom_token` — Axiom API token for log 
forwarding
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
new file mode 100644
index 0000000..6d2dcd6
--- /dev/null
+++ b/ansible/ansible.cfg
@@ -0,0 +1,3 @@
+[defaults]
+host_key_checking = False
+timeout = 30
diff --git a/ansible/production-euc.ini b/ansible/production-euc.ini
new file mode 100644
index 0000000..5d7ee50
--- /dev/null
+++ b/ansible/production-euc.ini
@@ -0,0 +1,9 @@
+# EU-Central proxy nodes (2 nodes)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=eu-central
+env=production
+
+[proxy]
+# proxy-euc-1 ansible_host=
+# proxy-euc-2 ansible_host=
diff --git a/ansible/production-euw.ini b/ansible/production-euw.ini
new file mode 100644
index 0000000..dde0031
--- /dev/null
+++ b/ansible/production-euw.ini
@@ -0,0 +1,9 @@
+# EU-West proxy nodes (2 nodes)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=eu-west
+env=production
+
+[proxy]
+# proxy-euw-1 ansible_host=
+# proxy-euw-2 ansible_host=
diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini
new file mode 100644
index 0000000..c6bf394
--- /dev/null
+++ b/ansible/production-usw.ini
@@ -0,0 +1,10 @@
+# US-West proxy nodes (3 nodes)
+# Update with actual IPs after provisioning.
+[all:vars]
+region=us-west
+env=production
+
+[proxy]
+# proxy-usw-1 ansible_host=
+# proxy-usw-2 ansible_host=
+# proxy-usw-3 ansible_host=
diff --git a/ansible/roll.yaml b/ansible/roll.yaml
new file mode 100644
index 0000000..2f328be
--- /dev/null
+++ b/ansible/roll.yaml
@@ -0,0 +1,103 @@
+---
+# Rolling deploy: build from branch (or download release), install binary, restart.
+# Usage:
+#   ansible-playbook -i production-usw.ini roll.yaml -e branch=feat/my-change
+#   ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0
+# With neither variable set, no binary is installed and the service is only restarted.
+
+- name: Roll Smart Git Proxy
+  hosts: all
+  become: yes
+  serial: 1
+  vars_files:
+    - secrets.yml
+  vars:
+    branch: ""
+    release_tag: ""
+  tasks:
+    - name: Determine environment from inventory file name
+      set_fact:
+        ENV: "{{ 'production' if 'production' in inventory_file else 'staging' if 'staging' in inventory_file else '' }}"
+
+    - name: Ensure ENV is determined
+      fail:
+        msg: "Failed to determine environment from inventory file name."
+      when: ENV == ''
+
+    # --- Branch build path ---
+    # Strict mode: a failed download or extract must fail the task before the
+    # existing /usr/local/go is removed, instead of exiting 0 via the final rm.
+    - name: Install Go for branch build
+      shell: |
+        set -euo pipefail
+        if /usr/local/go/bin/go version 2>/dev/null | grep -q 'go1.25'; then
+          echo "Go already installed"
+          exit 0
+        fi
+        wget -q https://go.dev/dl/go1.25.0.linux-amd64.tar.gz -O /tmp/go.tar.gz
+        rm -rf /usr/local/go
+        tar -C /usr/local -xzf /tmp/go.tar.gz
+        rm /tmp/go.tar.gz
+      args:
+        executable: /bin/bash
+      when: branch != ""
+
+    # The repo URL embeds github_token; no_log keeps it out of task output.
+    - name: Clone repo at branch
+      git:
+        repo: "https://{{ github_token }}@github.com/useblacksmith/smart-git-proxy.git"
+        dest: /tmp/smart-git-proxy-build
+        version: "{{ branch }}"
+        force: yes
+      no_log: true
+      when: branch != ""
+
+    - name: Build from branch
+      shell: |
+        set -euo pipefail
+        export PATH=/usr/local/go/bin:$PATH
+        cd /tmp/smart-git-proxy-build
+        make build
+      args:
+        executable: /bin/bash
+      when: branch != ""
+
+    - name: Install branch binary
+      copy:
+        src: /tmp/smart-git-proxy-build/bin/smart-git-proxy
+        dest: /usr/local/bin/smart-git-proxy
+        mode: "0755"
+        remote_src: yes
+      when: branch != ""
+
+    - name: Clean up build directory
+      file:
+        path: /tmp/smart-git-proxy-build
+        state: absent
+      when: branch != ""
+
+    # --- Release download path ---
+    - name: Download release binary
+      get_url:
+        url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_linux_amd64"
+        dest: /usr/local/bin/smart-git-proxy
+        mode: "0755"
+        force: yes
+      when: branch == "" and
release_tag != "" + + # --- Restart --- + - name: Restart smart-git-proxy + systemd: + name: smart-git-proxy + state: restarted + enabled: yes + + - name: Wait for health check + uri: + url: http://localhost:8080/healthz + status_code: 200 + timeout: 5 + register: health + retries: 10 + delay: 2 + until: health.status == 200 diff --git a/ansible/secrets.yml b/ansible/secrets.yml new file mode 100644 index 0000000..e1ed311 --- /dev/null +++ b/ansible/secrets.yml @@ -0,0 +1,3 @@ +# Encrypted with ansible-vault. +# Contains: github_token, axiom_token +# To edit: ansible-vault edit secrets.yml diff --git a/ansible/setup.yaml b/ansible/setup.yaml new file mode 100644 index 0000000..890788f --- /dev/null +++ b/ansible/setup.yaml @@ -0,0 +1,61 @@ +--- +# One-time setup: deploy systemd unit, env config, NVMe mount, Vector logging. +# Does not start the service -- the roll playbook handles that. + +- name: Setup Smart Git Proxy + hosts: all + become: yes + vars_files: + - secrets.yml + tasks: + - name: Determine environment from inventory file name + set_fact: + ENV: "{{ 'production' if 'production' in inventory_file else 'staging' if 'staging' in inventory_file else '' }}" + + - name: Ensure ENV is determined + fail: + msg: "Failed to determine environment from inventory file name." 
+ when: ENV == '' + + # --- NVMe storage --- + - name: Check if NVMe mirror directory exists + stat: + path: /mnt/nvme + register: nvme_mount + + - name: Create mirror directory + file: + path: "{{ mirror_dir | default('/mnt/nvme/mirrors') }}" + state: directory + mode: "0755" + when: nvme_mount.stat.exists + + # --- Config --- + - name: Ensure config directory exists + file: + path: /etc/smart-git-proxy + state: directory + mode: "0755" + + - name: Deploy environment config + template: + src: templates/smart-git-proxy-env.j2 + dest: /etc/smart-git-proxy/env + mode: "0600" + + # --- systemd --- + - name: Deploy systemd unit + template: + src: templates/smart-git-proxy.service.j2 + dest: /etc/systemd/system/smart-git-proxy.service + mode: "0644" + register: systemd_unit + + - name: Reload systemd daemon + systemd: + daemon_reload: yes + when: systemd_unit.changed + + # --- Axiom log forwarding --- + - name: Setup Axiom log forwarding + include_tasks: tasks/setup_axiom.yaml diff --git a/ansible/staging.ini b/ansible/staging.ini new file mode 100644 index 0000000..b5eb016 --- /dev/null +++ b/ansible/staging.ini @@ -0,0 +1,8 @@ +# Staging proxy nodes (1 node per region or shared) +# Update with actual IPs after provisioning. 
+[all:vars]
+region=us-west
+env=staging
+
+[proxy]
+# proxy-staging-1 ansible_host=
diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml
new file mode 100644
index 0000000..4d98135
--- /dev/null
+++ b/ansible/tasks/setup_axiom.yaml
@@ -0,0 +1,61 @@
+---
+- name: Add Vector repository
+  ansible.builtin.shell: |
+    bash -c "$(curl -L https://setup.vector.dev)"
+  args:
+    creates: /usr/share/keyrings/vector-archive-keyring.gpg
+
+- name: Install Vector
+  ansible.builtin.apt:
+    name: vector
+    state: present
+
+- name: Get hostname
+  shell: hostname
+  register: actual_hostname
+  changed_when: false
+
+- name: Create systemd override directory for Vector
+  file:
+    path: /etc/systemd/system/vector.service.d
+    state: directory
+    mode: "0755"
+
+- name: Configure Vector systemd override
+  copy:
+    content: |
+      [Service]
+      Environment="HOSTNAME={{ actual_hostname.stdout }}"
+    dest: /etc/systemd/system/vector.service.d/override.conf
+    mode: "0644"
+  register: vector_systemd_override
+
+- name: Reload systemd if Vector override changed
+  systemd:
+    daemon_reload: yes
+  when: vector_systemd_override.changed
+
+# The rendered config embeds axiom_token, so it must not be world-readable.
+# NOTE(review): assumes the Vector package provides a `vector` group that the service runs under; confirm.
+- name: Deploy Vector config
+  template:
+    src: templates/vector.yaml.j2
+    dest: /etc/vector/vector.yaml
+    owner: root
+    group: vector
+    mode: "0640"
+  no_log: true
+  register: vector_config
+
+- name: Enable and restart Vector
+  systemd:
+    name: vector
+    state: restarted
+    enabled: yes
+  when: vector_config.changed or vector_systemd_override.changed
+
+- name: Ensure Vector is running
+  systemd:
+    name: vector
+    state: started
+    enabled: yes
diff --git a/ansible/templates/smart-git-proxy-env.j2 b/ansible/templates/smart-git-proxy-env.j2
new file mode 100644
index 0000000..457b4ee
--- /dev/null
+++ b/ansible/templates/smart-git-proxy-env.j2
@@ -0,0 +1,10 @@
+# Smart Git Proxy configuration
+# See https://github.com/useblacksmith/smart-git-proxy#configuration
+
+LISTEN_ADDR=:8080
+MIRROR_DIR={{ mirror_dir | default('/mnt/nvme/mirrors') }}
+MIRROR_MAX_SIZE={{ mirror_max_size | default('80%') }}
+SYNC_STALE_AFTER={{ sync_stale_after | default('2s') }}
+ALLOWED_UPSTREAMS=github.com
+AUTH_MODE=pass-through
+LOG_LEVEL={{ log_level | default('info') }}
diff --git a/ansible/templates/smart-git-proxy.service.j2 b/ansible/templates/smart-git-proxy.service.j2
new file mode 100644
index 0000000..dae6118
--- /dev/null
+++ b/ansible/templates/smart-git-proxy.service.j2
@@ -0,0 +1,23 @@
+[Unit]
+Description=Smart Git Proxy
+After=network-online.target
+Wants=network-online.target
+StartLimitIntervalSec=300
+StartLimitBurst=10
+
+[Service]
+Type=exec
+Environment="HOME=/root"
+EnvironmentFile=/etc/smart-git-proxy/env
+ExecStart=/usr/local/bin/smart-git-proxy
+Restart=on-failure
+RestartSec=10
+TimeoutStopSec=30
+KillMode=control-group
+KillSignal=SIGTERM
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=smart-git-proxy
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2
new file mode 100644
index 0000000..5b06e65
--- /dev/null
+++ b/ansible/templates/vector.yaml.j2
@@ -0,0 +1,39 @@
+# Vector config for shipping smart-git-proxy logs to Axiom.
+sources:
+  smart_git_proxy_logs:
+    type: journald
+    units:
+      - smart-git-proxy
+
+transforms:
+  add_metadata:
+    type: remap
+    inputs:
+      - smart_git_proxy_logs
+    source: |
+      if exists(.message) {
+        parsed, err = parse_json(.message)
+        # merge() takes objects, but valid JSON can also be an array or a
+        # scalar, so only merge when the parsed value is an object.
+        if err == null && is_object(parsed) {
+          . = merge(., object!(parsed))
+        }
+      }
+
+      .
= merge(., { + "host": "${HOSTNAME}", + "environment": "{{ env }}", + "region": "{{ region }}", + "service_name": "smart-git-proxy" + }) + +sinks: + axiom: + type: axiom + inputs: + - add_metadata + token: "{{ axiom_token }}" + dataset: smart-git-proxy + batch: + max_bytes: 1049000 + timeout_secs: 1 diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml new file mode 100644 index 0000000..d4224fb --- /dev/null +++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml @@ -0,0 +1,60 @@ +# Grafana alert rules for Smart Git Proxy. +# Import into Grafana via Alerting > Alert rules > Import. + +groups: + - name: smart-git-proxy + interval: 1m + rules: + - alert: ProxyNodeDown + expr: up{job="smart-git-proxy"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Smart git proxy node {{ $labels.instance }} is down" + description: "Health endpoint unreachable for 2 minutes." + + - alert: NVMeUsageHigh + expr: smart_git_proxy_disk_usage_ratio > 0.80 + for: 5m + labels: + severity: warning + annotations: + summary: "NVMe usage >80% on {{ $labels.instance }}" + description: "Mirror storage at {{ $value | humanizePercentage }}. LRU eviction should handle this, but may indicate undersized disk or eviction failure." + + - alert: NVMeUsageCritical + expr: smart_git_proxy_disk_usage_ratio > 0.95 + for: 2m + labels: + severity: critical + annotations: + summary: "NVMe usage >95% on {{ $labels.instance }}" + description: "Mirror storage nearly full. LRU eviction may be failing." + + - alert: UpstreamFetchErrorsHigh + expr: sum(rate(smart_git_proxy_upstream_fetch_errors_total[5m])) by (instance) > 0.5 + for: 5m + labels: + severity: warning + annotations: + summary: "Elevated upstream fetch errors on {{ $labels.instance }}" + description: "Failing to fetch from GitHub at {{ $value | humanize }}/sec. May indicate GitHub rate limiting or network issues." 
+ + - alert: CloneLatencyHigh + expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket[5m])) by (le, instance)) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "p95 clone latency >30s on {{ $labels.instance }}" + description: "Slow git clone serving — may indicate NVMe I/O pressure or excessive concurrent requests." + + - alert: HighEvictionRate + expr: sum(rate(smart_git_proxy_evictions_total[5m])) by (instance) > 1 + for: 10m + labels: + severity: warning + annotations: + summary: "High LRU eviction rate on {{ $labels.instance }}" + description: "Evicting >1 mirror/sec sustained. May indicate insufficient NVMe capacity for the working set." diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json new file mode 100644 index 0000000..f3c1fb9 --- /dev/null +++ b/grafana/dashboards/smart-git-proxy.json @@ -0,0 +1,292 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "Clone Requests / sec", + "description": "Rate of git clone/fetch requests served by the proxy.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 1, + "targets": [ + { + "expr": "sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "title": "Cache Hit Rate", + "description": "Percentage of requests served from warm local mirror (no upstream GitHub fetch).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, + "id": 2, + "targets": [ + { + "expr": "sum(rate(smart_git_proxy_cache_hits_total{instance=~\"$instance\"}[$__rate_interval])) / sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) * 100", + "legendFormat": "hit rate %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "Clone Latency (p50 / p95 / p99)", + "description": "Time to serve a git clone/fetch request from local NVMe mirror.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, + "id": 3, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "Upstream GitHub Fetches / sec", + "description": "Rate of upstream fetches to GitHub (should be low if mirrors are warm).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, + "id": 4, + "targets": [ + { + "expr": 
"sum(rate(smart_git_proxy_upstream_fetches_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }, + "id": 101, + "title": "Storage", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "Mirror Count", + "description": "Number of bare git mirrors currently on disk.", + "type": "stat", + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 18 }, + "id": 5, + "targets": [ + { + "expr": "sum(smart_git_proxy_mirrors_total{instance=~\"$instance\"}) by (instance)", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "short" } + } + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "NVMe Usage %", + "description": "Disk usage percentage of the mirror storage directory.", + "type": "gauge", + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 18 }, + "id": 6, + "targets": [ + { + "expr": "smart_git_proxy_disk_usage_ratio{instance=~\"$instance\"} * 100", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 60 }, + { "color": "red", "value": 80 } + ] + } + } + } + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "NVMe Usage (bytes)", + "description": "Absolute disk usage over time for the mirror directory.", + "type": "timeseries", + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 18 }, + "id": 7, + "targets": [ + { + "expr": "smart_git_proxy_disk_usage_bytes{instance=~\"$instance\"}", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { "drawStyle": 
"line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, + "id": 102, + "title": "Per-Repo Traffic", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "Top Repos by Request Rate", + "description": "Clone/fetch request rate broken down by repository.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 }, + "id": 8, + "targets": [ + { + "expr": "topk(10, sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))", + "legendFormat": "{{ repo }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }, + "id": 103, + "title": "Errors", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "Upstream Fetch Errors / sec", + "description": "Failed upstream GitHub fetches.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }, + "id": 9, + "targets": [ + { + "expr": "sum(rate(smart_git_proxy_upstream_fetch_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "title": "LRU Evictions / sec", + "description": "Rate of mirrors evicted to free disk space.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }, + "id": 10, + "targets": [ + { + "expr": "sum(rate(smart_git_proxy_evictions_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { 
"drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + } + ], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": { "text": "Prometheus", "value": "PBFA97CFB590B2093" } + }, + { + "name": "instance", + "type": "query", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "query": "label_values(smart_git_proxy_clone_requests_total, instance)", + "includeAll": true, + "allValue": ".*", + "multi": true, + "current": { "text": "All", "value": "$__all" } + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "utc", + "title": "Smart Git Proxy", + "uid": "smart-git-proxy", + "version": 1, + "schemaVersion": 36 +} From 7e28b4043ff6db59e95f03a65d0e53ca7b59d301 Mon Sep 17 00:00:00 2001 From: piotr Date: Tue, 16 Jun 2026 22:04:41 +0000 Subject: [PATCH 02/17] fix grafana metrics, nvme setup, and release download - grafana: align metric names with internal/metrics/metrics.go (requests_total, request_seconds, sync_total, errors_total, responses_total) instead of invented names - ansible/setup: fail explicitly when /mnt/nvme is missing instead of silently skipping mirror directory creation - ansible/roll: download goreleaser tar.gz archive and extract, matching the actual release asset naming convention Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/roll.yaml | 19 ++- ansible/setup.yaml | 8 +- .../alert-rules/smart-git-proxy-alerts.yaml | 49 +++--- grafana/dashboards/smart-git-proxy.json | 150 +++++------------- 4 files changed, 78 insertions(+), 148 deletions(-) diff --git a/ansible/roll.yaml b/ansible/roll.yaml index 2f328be..4b38278 100644 --- a/ansible/roll.yaml +++ b/ansible/roll.yaml @@ -69,14 +69,25 @@ when: branch != "" # --- Release download path --- - - name: Download release binary + # Goreleaser publishes archives as: smart-git-proxy__linux_amd64.tar.gz + - name: Download release 
archive get_url: - url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_linux_amd64" - dest: /usr/local/bin/smart-git-proxy - mode: "0755" + url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_{{ release_tag | regex_replace('^v', '') }}_linux_amd64.tar.gz" + dest: /tmp/smart-git-proxy-release.tar.gz force: yes when: branch == "" and release_tag != "" + - name: Extract release binary + shell: | + set -euo pipefail + mkdir -p /tmp/smart-git-proxy-release + tar -xzf /tmp/smart-git-proxy-release.tar.gz -C /tmp/smart-git-proxy-release + install -m 0755 /tmp/smart-git-proxy-release/smart-git-proxy /usr/local/bin/smart-git-proxy + rm -rf /tmp/smart-git-proxy-release /tmp/smart-git-proxy-release.tar.gz + args: + executable: /bin/bash + when: branch == "" and release_tag != "" + # --- Restart --- - name: Restart smart-git-proxy systemd: diff --git a/ansible/setup.yaml b/ansible/setup.yaml index 890788f..e86e001 100644 --- a/ansible/setup.yaml +++ b/ansible/setup.yaml @@ -18,17 +18,21 @@ when: ENV == '' # --- NVMe storage --- - - name: Check if NVMe mirror directory exists + - name: Verify NVMe mount exists stat: path: /mnt/nvme register: nvme_mount + - name: Fail if NVMe mount is missing + fail: + msg: "/mnt/nvme does not exist. Proxy nodes require NVMe storage for git mirrors." + when: not nvme_mount.stat.exists + - name: Create mirror directory file: path: "{{ mirror_dir | default('/mnt/nvme/mirrors') }}" state: directory mode: "0755" - when: nvme_mount.stat.exists # --- Config --- - name: Ensure config directory exists diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml index d4224fb..441e82f 100644 --- a/grafana/alert-rules/smart-git-proxy-alerts.yaml +++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml @@ -1,5 +1,12 @@ # Grafana alert rules for Smart Git Proxy. 
# Import into Grafana via Alerting > Alert rules > Import. +# +# Metric names match internal/metrics/metrics.go: +# smart_git_proxy_requests_total (repo, kind, source) +# smart_git_proxy_responses_total (repo, kind, status) +# smart_git_proxy_errors_total (repo, kind) +# smart_git_proxy_request_seconds (repo, kind) [histogram] +# smart_git_proxy_sync_total (repo, result) groups: - name: smart-git-proxy @@ -14,47 +21,29 @@ groups: summary: "Smart git proxy node {{ $labels.instance }} is down" description: "Health endpoint unreachable for 2 minutes." - - alert: NVMeUsageHigh - expr: smart_git_proxy_disk_usage_ratio > 0.80 + - alert: ErrorRateHigh + expr: sum(rate(smart_git_proxy_errors_total[5m])) by (instance) > 0.5 for: 5m labels: severity: warning annotations: - summary: "NVMe usage >80% on {{ $labels.instance }}" - description: "Mirror storage at {{ $value | humanizePercentage }}. LRU eviction should handle this, but may indicate undersized disk or eviction failure." + summary: "Elevated error rate on {{ $labels.instance }}" + description: "Errors at {{ $value | humanize }}/sec. May indicate upstream GitHub issues or local git failures." - - alert: NVMeUsageCritical - expr: smart_git_proxy_disk_usage_ratio > 0.95 - for: 2m - labels: - severity: critical - annotations: - summary: "NVMe usage >95% on {{ $labels.instance }}" - description: "Mirror storage nearly full. LRU eviction may be failing." - - - alert: UpstreamFetchErrorsHigh - expr: sum(rate(smart_git_proxy_upstream_fetch_errors_total[5m])) by (instance) > 0.5 + - alert: RequestLatencyHigh + expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket[5m])) by (le, instance)) > 30 for: 5m labels: severity: warning annotations: - summary: "Elevated upstream fetch errors on {{ $labels.instance }}" - description: "Failing to fetch from GitHub at {{ $value | humanize }}/sec. May indicate GitHub rate limiting or network issues." 
+ summary: "p95 request latency >30s on {{ $labels.instance }}" + description: "Slow git request serving — may indicate NVMe I/O pressure or excessive concurrent requests." - - alert: CloneLatencyHigh - expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket[5m])) by (le, instance)) > 30 + - alert: SyncFailureRateHigh + expr: sum(rate(smart_git_proxy_sync_total{result="error"}[5m])) by (instance) > 0.5 for: 5m labels: severity: warning annotations: - summary: "p95 clone latency >30s on {{ $labels.instance }}" - description: "Slow git clone serving — may indicate NVMe I/O pressure or excessive concurrent requests." - - - alert: HighEvictionRate - expr: sum(rate(smart_git_proxy_evictions_total[5m])) by (instance) > 1 - for: 10m - labels: - severity: warning - annotations: - summary: "High LRU eviction rate on {{ $labels.instance }}" - description: "Evicting >1 mirror/sec sustained. May indicate insufficient NVMe capacity for the working set." + summary: "Elevated sync failures on {{ $labels.instance }}" + description: "Upstream GitHub sync failures at {{ $value | humanize }}/sec. May indicate rate limiting or network issues." 
diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json index f3c1fb9..728d146 100644 --- a/grafana/dashboards/smart-git-proxy.json +++ b/grafana/dashboards/smart-git-proxy.json @@ -27,14 +27,14 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "Clone Requests / sec", - "description": "Rate of git clone/fetch requests served by the proxy.", + "title": "Requests / sec", + "description": "Rate of git requests served by the proxy (all kinds: info-refs, upload-pack).", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, "id": 1, "targets": [ { - "expr": "sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", "legendFormat": "{{ instance }}" } ], @@ -47,44 +47,42 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "Cache Hit Rate", - "description": "Percentage of requests served from warm local mirror (no upstream GitHub fetch).", + "title": "Requests by Kind", + "description": "Request rate broken down by kind (info-refs, upload-pack).", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, "id": 2, "targets": [ { - "expr": "sum(rate(smart_git_proxy_cache_hits_total{instance=~\"$instance\"}[$__rate_interval])) / sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) * 100", - "legendFormat": "hit rate %" + "expr": "sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (kind)", + "legendFormat": "{{ kind }}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "min": 0, - "max": 100, + "unit": "reqps", "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } } } }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "Clone Latency (p50 / p95 / p99)", - 
"description": "Time to serve a git clone/fetch request from local NVMe mirror.", + "title": "Request Latency (p50 / p95 / p99)", + "description": "Time to serve a git request from local NVMe mirror.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 3, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", "legendFormat": "p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", "legendFormat": "p95" }, { - "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_clone_duration_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", "legendFormat": "p99" } ], @@ -97,15 +95,15 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "Upstream GitHub Fetches / sec", - "description": "Rate of upstream fetches to GitHub (should be low if mirrors are warm).", + "title": "Mirror Syncs / sec", + "description": "Rate of mirror sync operations (upstream fetches from GitHub), by result.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 4, "targets": [ { - "expr": "sum(rate(smart_git_proxy_upstream_fetches_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", - "legendFormat": "{{ instance }}" + "expr": "sum(rate(smart_git_proxy_sync_total{instance=~\"$instance\"}[$__rate_interval])) by (result)", + "legendFormat": "{{ result }}" } ], "fieldConfig": 
{ @@ -119,77 +117,52 @@ "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }, "id": 101, - "title": "Storage", + "title": "Responses & Errors", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "Mirror Count", - "description": "Number of bare git mirrors currently on disk.", - "type": "stat", - "gridPos": { "h": 6, "w": 6, "x": 0, "y": 18 }, + "title": "Response Status", + "description": "Responses by HTTP status code.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, "id": 5, "targets": [ { - "expr": "sum(smart_git_proxy_mirrors_total{instance=~\"$instance\"}) by (instance)", - "legendFormat": "{{ instance }}" - } - ], - "fieldConfig": { - "defaults": { "unit": "short" } - } - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "NVMe Usage %", - "description": "Disk usage percentage of the mirror storage directory.", - "type": "gauge", - "gridPos": { "h": 6, "w": 6, "x": 6, "y": 18 }, - "id": 6, - "targets": [ - { - "expr": "smart_git_proxy_disk_usage_ratio{instance=~\"$instance\"} * 100", - "legendFormat": "{{ instance }}" + "expr": "sum(rate(smart_git_proxy_responses_total{instance=~\"$instance\"}[$__rate_interval])) by (status)", + "legendFormat": "{{ status }}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 60 }, - { "color": "red", "value": 80 } - ] - } + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } } } }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "NVMe Usage (bytes)", - "description": "Absolute disk usage over time for the mirror directory.", + "title": "Errors / sec", + "description": "Error rate by instance.", "type": "timeseries", - "gridPos": { "h": 6, "w": 12, "x": 12, "y": 18 }, - "id": 7, + "gridPos": { "h": 8, "w": 
12, "x": 12, "y": 18 }, + "id": 6, "targets": [ { - "expr": "smart_git_proxy_disk_usage_bytes{instance=~\"$instance\"}", + "expr": "sum(rate(smart_git_proxy_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", "legendFormat": "{{ instance }}" } ], "fieldConfig": { "defaults": { - "unit": "bytes", + "unit": "reqps", "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } } } }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, "id": 102, "title": "Per-Repo Traffic", "type": "row" @@ -197,13 +170,13 @@ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "title": "Top Repos by Request Rate", - "description": "Clone/fetch request rate broken down by repository.", + "description": "Request rate broken down by repository.", "type": "timeseries", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 }, - "id": 8, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 27 }, + "id": 7, "targets": [ { - "expr": "topk(10, sum(rate(smart_git_proxy_clone_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))", + "expr": "topk(10, sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))", "legendFormat": "{{ repo }}" } ], @@ -213,53 +186,6 @@ "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } } } - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }, - "id": 103, - "title": "Errors", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "Upstream Fetch Errors / sec", - "description": "Failed upstream GitHub fetches.", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }, - "id": 9, - "targets": [ - { - "expr": "sum(rate(smart_git_proxy_upstream_fetch_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", - "legendFormat": "{{ instance }}" - } - ], - "fieldConfig": { - "defaults": { - 
"unit": "reqps", - "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } - } - } - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "title": "LRU Evictions / sec", - "description": "Rate of mirrors evicted to free disk space.", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }, - "id": 10, - "targets": [ - { - "expr": "sum(rate(smart_git_proxy_evictions_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)", - "legendFormat": "{{ instance }}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "reqps", - "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } - } - } } ], "templating": { @@ -274,7 +200,7 @@ "name": "instance", "type": "query", "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "query": "label_values(smart_git_proxy_clone_requests_total, instance)", + "query": "label_values(smart_git_proxy_requests_total, instance)", "includeAll": true, "allValue": ".*", "multi": true, From aa31b37817742795233f74c3ba6a142aaba4fd4b Mon Sep 17 00:00:00 2001 From: piotr Date: Tue, 16 Jun 2026 22:07:28 +0000 Subject: [PATCH 03/17] fix review feedback: GO=go override, deploy guard, histogram grouping, vector perms - roll.yaml: export GO=go before make build (Makefile defaults to mise exec -- go which isn't on deploy targets) - roll.yaml: fail early if neither branch nor release_tag provided, preventing needless service restart - grafana: group histogram_quantile by (le, instance) to avoid merging latencies across nodes - setup_axiom: tighten vector.yaml to 0600 (contains axiom_token) Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/roll.yaml | 7 +++++++ ansible/tasks/setup_axiom.yaml | 2 +- grafana/dashboards/smart-git-proxy.json | 12 ++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/ansible/roll.yaml b/ansible/roll.yaml index 4b38278..8fe2186 100644 --- a/ansible/roll.yaml +++ 
b/ansible/roll.yaml @@ -48,6 +48,7 @@ shell: | set -euo pipefail export PATH=/usr/local/go/bin:$PATH + export GO=go cd /tmp/smart-git-proxy-build make build args: @@ -88,6 +89,12 @@ executable: /bin/bash when: branch == "" and release_tag != "" + # --- Validate deploy target --- + - name: Fail if neither branch nor release_tag provided + fail: + msg: "Must specify either -e branch= or -e release_tag=" + when: branch == "" and release_tag == "" + # --- Restart --- - name: Restart smart-git-proxy systemd: diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml index 4d98135..6a3e2f1 100644 --- a/ansible/tasks/setup_axiom.yaml +++ b/ansible/tasks/setup_axiom.yaml @@ -39,7 +39,7 @@ template: src: templates/vector.yaml.j2 dest: /etc/vector/vector.yaml - mode: "0644" + mode: "0600" register: vector_config - name: Enable and restart Vector diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json index 728d146..fdfae93 100644 --- a/grafana/dashboards/smart-git-proxy.json +++ b/grafana/dashboards/smart-git-proxy.json @@ -74,16 +74,16 @@ "id": 3, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", - "legendFormat": "p50" + "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))", + "legendFormat": "{{ instance }} p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", - "legendFormat": "p95" + "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))", + "legendFormat": "{{ instance }} p95" }, { - "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le))", - 
"legendFormat": "p99" + "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))", + "legendFormat": "{{ instance }} p99" } ], "fieldConfig": { From 666b88e02150d1e2ecddf096225fb3c1653a37bf Mon Sep 17 00:00:00 2001 From: piotr Date: Tue, 16 Jun 2026 22:34:06 +0000 Subject: [PATCH 04/17] ansible: add update_cache to vector apt install Ensures apt cache is refreshed after adding the Vector repo so the package is discoverable on first run. Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/tasks/setup_axiom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml index 6a3e2f1..cf3a2f9 100644 --- a/ansible/tasks/setup_axiom.yaml +++ b/ansible/tasks/setup_axiom.yaml @@ -9,6 +9,7 @@ ansible.builtin.apt: name: vector state: present + update_cache: yes - name: Get hostname shell: hostname From 2fdf83acc8ef4bdc62b6f2915a30dca85cbca0d1 Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 12:53:07 +0000 Subject: [PATCH 05/17] add CI workflows for build+deploy and grafana dashboard sync - build-and-deploy.yaml: build Go binary, push to R2, deploy via Ansible+Tailscale (same pattern as storage-agent) - deploy-grafana-dashboards.yml: auto-deploy dashboards and alert rules to self-hosted Grafana on push to main/production Required secrets: R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY, TS_OAUTH_CLIENT_ID, TS_OAUTH_SECRET, ANSIBLE_SECRET, SELF_HOSTED_GRAFANA_URL, SELF_HOSTED_GRAFANA_USER, SELF_HOSTED_GRAFANA_PASSWORD Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/build-and-deploy.yaml | 176 ++++++++++++++ .../workflows/deploy-grafana-dashboards.yml | 229 ++++++++++++++++++ 2 files changed, 405 insertions(+) create mode 100644 .github/workflows/build-and-deploy.yaml create mode 100644 
.github/workflows/deploy-grafana-dashboards.yml diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml new file mode 100644 index 0000000..9431e86 --- /dev/null +++ b/.github/workflows/build-and-deploy.yaml @@ -0,0 +1,176 @@ +name: Build and Deploy + +on: + push: + branches: + - main + - production + pull_request: + workflow_dispatch: + inputs: + branch: + description: "Branch to deploy (main/production)" + required: true + type: choice + options: + - main + - production + default: "main" + run_deployment: + description: "Run ansible deployment after build" + required: true + type: boolean + default: false + +concurrency: + group: "build-and-deploy-${{ github.ref == 'refs/heads/production' && 'production' || github.event.inputs.branch == 'production' && 'production' || 'staging' }}" + cancel-in-progress: false + +jobs: + build-and-deploy: + runs-on: blacksmith-8vcpu-ubuntu-2204 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Build binary + run: | + export GO=go + make build + cp bin/smart-git-proxy ./smart-git-proxy + + - name: Run tests + if: github.event_name == 'pull_request' + run: go test ./... 
+ + - name: Install rclone + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch' + run: curl https://rclone.org/install.sh | sudo bash + + - name: Configure rclone + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch' + env: + R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} + R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} + run: | + mkdir -p ~/.config/rclone + cat > ~/.config/rclone/rclone.conf <> $GITHUB_ENV + if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/production/$SHA + else + rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/main/$SHA + fi + + - name: Set up Python + if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install Ansible + if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + run: | + python -m pip install --upgrade pip + pip install ansible + + - name: Connect to Tailscale + if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + uses: tailscale/github-action@v3 + with: + oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} + oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} + tags: tag:ci + + - name: Create Ansible Vault password file + if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + run: echo "${{ secrets.ANSIBLE_SECRET }}" > ~/vault-password.txt + + - name: Check host connectivity + if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && 
github.event.inputs.run_deployment == 'true') + working-directory: ansible + run: | + sleep 10 + if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + INVENTORY_FILES="production-usw.ini production-euw.ini production-euc.ini" + else + INVENTORY_FILES="staging.ini" + fi + for INVENTORY_FILE in $INVENTORY_FILES; do + echo "=== Checking hosts in $INVENTORY_FILE ===" + HOSTS=$(ansible-inventory -i $INVENTORY_FILE --list | jq -r '._meta.hostvars | keys[]') + if [ -z "$HOSTS" ]; then + echo "warning: no hosts found in $INVENTORY_FILE, skipping" + continue + fi + for host in $HOSTS; do + echo "Testing connectivity to $host..." + start_time=$(date +%s) + while true; do + if tailscale ping -c 1 --timeout=5s $host >/dev/null 2>&1; then + echo "$host is reachable" + break + fi + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if [ $elapsed -ge 30 ]; then + echo "error: timeout after 30s waiting for $host" + exit 1 + fi + echo "Waiting for $host... 
(${elapsed}s elapsed)" + sleep 5 + done + done + done + env: + ANSIBLE_HOST_KEY_CHECKING: "False" + + - name: Run Ansible rolling deploy + id: ansible-deploy + if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + working-directory: ansible + run: | + BRANCH_ARG="-e branch=${{ github.ref_name }}" + if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + for REGION_INI in production-usw.ini production-euw.ini production-euc.ini; do + echo "=== Rolling region: $REGION_INI ===" + ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i "$REGION_INI" \ + --vault-password-file ~/vault-password.txt \ + roll.yaml $BRANCH_ARG -v + done + else + ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i staging.ini \ + --vault-password-file ~/vault-password.txt \ + roll.yaml $BRANCH_ARG -v + fi + env: + ANSIBLE_HOST_KEY_CHECKING: "False" + + - name: Send Slack notification on failure + if: failure() && steps.ansible-deploy.outcome == 'failure' + uses: slackapi/slack-github-action@v1 + with: + payload: | + { + "text": "Ansible deploy failed for smart-git-proxy! 
Branch: ${{ github.ref_name || github.event.inputs.branch }}, Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/.github/workflows/deploy-grafana-dashboards.yml b/.github/workflows/deploy-grafana-dashboards.yml new file mode 100644 index 0000000..a207aec --- /dev/null +++ b/.github/workflows/deploy-grafana-dashboards.yml @@ -0,0 +1,229 @@ +name: Deploy Dashboards & Alerts to Self-Hosted Grafana + +on: + push: + branches: [main, production] + paths: + - "grafana/dashboards/*.json" + - "grafana/alert-rules/*.yaml" + - "grafana/alert-rules/*.yml" + - ".github/workflows/deploy-grafana-dashboards.yml" + pull_request: + paths: + - "grafana/dashboards/*.json" + - "grafana/alert-rules/*.yaml" + - "grafana/alert-rules/*.yml" + - ".github/workflows/deploy-grafana-dashboards.yml" + workflow_dispatch: + +jobs: + deploy-dashboards: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set folder name + run: | + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + echo "FOLDER_NAME=Smart Git Proxy Production" >> $GITHUB_ENV + else + echo "FOLDER_NAME=Smart Git Proxy Staging" >> $GITHUB_ENV + fi + + - name: Validate JSON syntax + run: | + for dashboard in grafana/dashboards/*.json; do + echo "Validating $dashboard..." + jq . 
"$dashboard" > /dev/null || exit 1 + done + echo "All dashboard JSON files are valid" + + - name: Deploy dashboards to Self-Hosted Grafana + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' + env: + GRAFANA_URL: ${{ secrets.SELF_HOSTED_GRAFANA_URL }} + GRAFANA_USER: ${{ secrets.SELF_HOSTED_GRAFANA_USER }} + GRAFANA_PASSWORD: ${{ secrets.SELF_HOSTED_GRAFANA_PASSWORD }} + run: | + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + UID_SUFFIX="-prod-self" + else + UID_SUFFIX="-staging-self" + fi + + AUTH_HEADER="Authorization: Basic $(echo -n "$GRAFANA_USER:$GRAFANA_PASSWORD" | base64)" + + # Ensure folder exists + FOLDERS_RESPONSE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/folders") + FOLDER_UID=$(echo "$FOLDERS_RESPONSE" | jq -r --arg name "$FOLDER_NAME" '.[] | select(.title == $name) | .uid' | head -1) + + if [ -z "$FOLDER_UID" ] || [ "$FOLDER_UID" == "null" ]; then + FOLDER_UID=$(echo "$FOLDER_NAME" | tr '[:upper:]' '[:lower:]' | tr ' ' '-')-$(date +%s) + CREATE_RESPONSE=$(curl -s -X POST \ + -H "$AUTH_HEADER" \ + -H "Content-Type: application/json" \ + -d "{\"title\": \"$FOLDER_NAME\", \"uid\": \"$FOLDER_UID\"}" \ + "$GRAFANA_URL/api/folders") + if ! echo "$CREATE_RESPONSE" | grep -q '"uid"'; then + echo "Error creating folder: $CREATE_RESPONSE" + exit 1 + fi + fi + + echo "FOLDER_UID=$FOLDER_UID" >> $GITHUB_ENV + + for dashboard_file in grafana/dashboards/*.json; do + if [ -f "$dashboard_file" ]; then + dashboard_name=$(basename "$dashboard_file" .json) + echo "Uploading $dashboard_name..." + + DASHBOARD_JSON=$(cat "$dashboard_file") + + # Append environment suffix to UID + if echo "$DASHBOARD_JSON" | jq -e '.uid' > /dev/null 2>&1; then + ORIGINAL_UID=$(echo "$DASHBOARD_JSON" | jq -r '.uid') + DASHBOARD_JSON=$(echo "$DASHBOARD_JSON" | jq --arg uid "${ORIGINAL_UID}${UID_SUFFIX}" '.uid = $uid') + else + DASHBOARD_JSON=$(echo "$DASHBOARD_JSON" | jq --arg uid "${dashboard_name}${UID_SUFFIX}" '. 
+ {uid: $uid}') + fi + + PAYLOAD_FILE=$(mktemp) + echo "$DASHBOARD_JSON" | jq \ + --arg folderUid "$FOLDER_UID" \ + '{dashboard: ., folderUid: $folderUid, overwrite: true}' > "$PAYLOAD_FILE" + + RESPONSE=$(curl -s -X POST \ + -H "$AUTH_HEADER" \ + -H "Content-Type: application/json" \ + --data-binary "@$PAYLOAD_FILE" \ + "$GRAFANA_URL/api/dashboards/db") + rm -f "$PAYLOAD_FILE" + + if echo "$RESPONSE" | grep -q '"status":"success"'; then + echo " Uploaded $dashboard_name (version: $(echo "$RESPONSE" | jq -r .version))" + else + echo " Failed: $RESPONSE" + echo " Continuing..." + fi + fi + done + + - name: Deploy alert rules to Self-Hosted Grafana + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' + env: + GRAFANA_URL: ${{ secrets.SELF_HOSTED_GRAFANA_URL }} + GRAFANA_USER: ${{ secrets.SELF_HOSTED_GRAFANA_USER }} + GRAFANA_PASSWORD: ${{ secrets.SELF_HOSTED_GRAFANA_PASSWORD }} + run: | + AUTH_HEADER="Authorization: Basic $(echo -n "$GRAFANA_USER:$GRAFANA_PASSWORD" | base64)" + + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + DASHBOARD_UID_SUFFIX="-prod-self" + else + DASHBOARD_UID_SUFFIX="-staging-self" + fi + + # Resolve Prometheus datasource UID + DATASOURCES_RESPONSE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/datasources") + DATASOURCE_UID=$(echo "$DATASOURCES_RESPONSE" | jq -r '[.[] | select(.type=="prometheus")] | .[0].uid // empty') + if [ -z "$DATASOURCE_UID" ] || [ "$DATASOURCE_UID" = "null" ]; then + echo "Could not resolve Prometheus datasource UID" + exit 1 + fi + echo "Using Prometheus datasource UID: $DATASOURCE_UID" + + # Use same folder as dashboards + ALERT_FOLDER_UID="${FOLDER_UID}" + + shopt -s nullglob + alert_files=(grafana/alert-rules/*.yaml grafana/alert-rules/*.yml) + if [ ${#alert_files[@]} -eq 0 ]; then + echo "No alert rule files found, skipping" + exit 0 + fi + + # Install yq + if ! 
command -v yq &> /dev/null; then + wget -q https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O yq + chmod +x yq + YQ_CMD="./yq" + else + YQ_CMD="yq" + fi + + for alert_file in "${alert_files[@]}"; do + alert_name=$(basename "$alert_file" .yaml) + alert_name=$(basename "$alert_name" .yml) + echo "Uploading alert rules from $alert_name..." + + ALERT_JSON=$($YQ_CMD eval -o=json "$alert_file") + + # Replace datasource placeholders and template variables + ALERT_JSON=$(echo "$ALERT_JSON" | jq --arg uid "$DATASOURCE_UID" ' + .groups |= map( + del(.folder) | + .rules |= map( + .data |= map( + (if (has("datasourceUid") and .datasourceUid == "${datasource}") then + .datasourceUid = $uid + else . end) | + if has("model") and (.model | has("expr")) then + .model.expr = (.model.expr | + gsub("\\$instance"; ".*") | + gsub("instance=~\"\\*\""; "instance=~\".*\"")) + else . end + ) + ) + )') + + GROUPS_COUNT=$(echo "$ALERT_JSON" | jq '.groups | length') + for group_idx in $(seq 0 $((GROUPS_COUNT - 1))); do + RULES_COUNT=$(echo "$ALERT_JSON" | jq --argjson g "$group_idx" '.groups[$g].rules | length') + GROUP_NAME=$(echo "$ALERT_JSON" | jq -r --argjson g "$group_idx" '.groups[$g].name') + GROUP_INTERVAL=$(echo "$ALERT_JSON" | jq -r --argjson g "$group_idx" '.groups[$g].interval // "1m"') + + for rule_idx in $(seq 0 $((RULES_COUNT - 1))); do + RULE=$(echo "$ALERT_JSON" | jq --argjson g "$group_idx" --argjson idx "$rule_idx" '.groups[$g].rules[$idx]') + ORIGINAL_RULE_UID=$(echo "$RULE" | jq -r '.uid') + + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + RULE_UID="${ORIGINAL_RULE_UID}_p" + else + RULE_UID="${ORIGINAL_RULE_UID}_s" + fi + + RULE_PAYLOAD=$(echo "$RULE" | jq \ + --arg folderUid "$ALERT_FOLDER_UID" \ + --arg groupName "$GROUP_NAME" \ + --arg interval "$GROUP_INTERVAL" \ + --arg ruleUid "$RULE_UID" ' + . 
+ {uid: $ruleUid, folderUID: $folderUid, ruleGroup: $groupName, interval: $interval}') + + RULE_PAYLOAD=$(echo "$RULE_PAYLOAD" | jq --arg suffix "$DASHBOARD_UID_SUFFIX" ' + if (.annotations.__dashboardUid__ // "") != "" then + .annotations.__dashboardUid__ += $suffix + else . end') + + # Upsert rule + EXISTING_RULE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/v1/provisioning/alert-rules/$RULE_UID") + if echo "$EXISTING_RULE" | grep -q '"uid"'; then + RESPONSE=$(curl -s -X PUT \ + -H "$AUTH_HEADER" -H "Content-Type: application/json" -H "X-Disable-Provenance: true" \ + -d "$RULE_PAYLOAD" "$GRAFANA_URL/api/v1/provisioning/alert-rules/$RULE_UID") + else + RESPONSE=$(curl -s -X POST \ + -H "$AUTH_HEADER" -H "Content-Type: application/json" -H "X-Disable-Provenance: true" \ + -d "$RULE_PAYLOAD" "$GRAFANA_URL/api/v1/provisioning/alert-rules") + fi + + if echo "$RESPONSE" | grep -q '"uid"'; then + echo " Processed rule: $RULE_UID" + else + echo " Failed rule $RULE_UID: $RESPONSE" + exit 1 + fi + done + done + done + + echo "Alert rules deployment complete!" 
From f3198b971fe30c6fe601197156d3f3e15f57031e Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 12:56:39 +0000 Subject: [PATCH 06/17] fix bugbot issues: deploy gating, branch resolution, alert UIDs - Add deploy-target step that centralizes env/branch resolution; production pushes now correctly trigger Ansible deploy - Manual workflow_dispatch uses inputs.branch for Ansible instead of the checked-out ref name - Add unique uid to each alert rule (sgp-node-down, sgp-error-rate, sgp-latency-high, sgp-sync-failures) so Grafana provisioning upserts correctly Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/build-and-deploy.yaml | 30 +++++++++++++------ .../alert-rules/smart-git-proxy-alerts.yaml | 12 +++++--- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml index 9431e86..aaf72f1 100644 --- a/.github/workflows/build-and-deploy.yaml +++ b/.github/workflows/build-and-deploy.yaml @@ -80,20 +80,32 @@ jobs: rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/main/$SHA fi + - name: Determine deploy target + id: deploy-target + if: (github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production')) || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + run: | + if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + echo "env=production" >> $GITHUB_OUTPUT + echo "branch=production" >> $GITHUB_OUTPUT + else + echo "env=staging" >> $GITHUB_OUTPUT + echo "branch=${{ github.event.inputs.branch || github.ref_name }}" >> $GITHUB_OUTPUT + fi + - name: Set up Python - if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + if: steps.deploy-target.outcome == 'success' uses: actions/setup-python@v5 with: python-version: "3.x" - name: 
Install Ansible - if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + if: steps.deploy-target.outcome == 'success' run: | python -m pip install --upgrade pip pip install ansible - name: Connect to Tailscale - if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + if: steps.deploy-target.outcome == 'success' uses: tailscale/github-action@v3 with: oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} @@ -101,15 +113,15 @@ jobs: tags: tag:ci - name: Create Ansible Vault password file - if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + if: steps.deploy-target.outcome == 'success' run: echo "${{ secrets.ANSIBLE_SECRET }}" > ~/vault-password.txt - name: Check host connectivity - if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + if: steps.deploy-target.outcome == 'success' working-directory: ansible run: | sleep 10 - if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + if [[ "${{ steps.deploy-target.outputs.env }}" == "production" ]]; then INVENTORY_FILES="production-usw.ini production-euw.ini production-euc.ini" else INVENTORY_FILES="staging.ini" @@ -145,11 +157,11 @@ jobs: - name: Run Ansible rolling deploy id: ansible-deploy - if: github.ref == 'refs/heads/main' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + if: steps.deploy-target.outcome == 'success' working-directory: ansible run: | - BRANCH_ARG="-e branch=${{ github.ref_name }}" - if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + BRANCH_ARG="-e branch=${{ steps.deploy-target.outputs.branch }}" + if [[ "${{ 
steps.deploy-target.outputs.env }}" == "production" ]]; then for REGION_INI in production-usw.ini production-euw.ini production-euc.ini; do echo "=== Rolling region: $REGION_INI ===" ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i "$REGION_INI" \ diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml index 441e82f..42d5296 100644 --- a/grafana/alert-rules/smart-git-proxy-alerts.yaml +++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml @@ -12,7 +12,8 @@ groups: - name: smart-git-proxy interval: 1m rules: - - alert: ProxyNodeDown + - uid: sgp-node-down + alert: ProxyNodeDown expr: up{job="smart-git-proxy"} == 0 for: 2m labels: @@ -21,7 +22,8 @@ groups: summary: "Smart git proxy node {{ $labels.instance }} is down" description: "Health endpoint unreachable for 2 minutes." - - alert: ErrorRateHigh + - uid: sgp-error-rate + alert: ErrorRateHigh expr: sum(rate(smart_git_proxy_errors_total[5m])) by (instance) > 0.5 for: 5m labels: @@ -30,7 +32,8 @@ groups: summary: "Elevated error rate on {{ $labels.instance }}" description: "Errors at {{ $value | humanize }}/sec. May indicate upstream GitHub issues or local git failures." - - alert: RequestLatencyHigh + - uid: sgp-latency-high + alert: RequestLatencyHigh expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket[5m])) by (le, instance)) > 30 for: 5m labels: @@ -39,7 +42,8 @@ groups: summary: "p95 request latency >30s on {{ $labels.instance }}" description: "Slow git request serving — may indicate NVMe I/O pressure or excessive concurrent requests." 
- - alert: SyncFailureRateHigh + - uid: sgp-sync-failures + alert: SyncFailureRateHigh expr: sum(rate(smart_git_proxy_sync_total{result="error"}[5m])) by (instance) > 0.5 for: 5m labels: From 5861eca7c1f03e18cd94606245775880a1440b72 Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 13:24:02 +0000 Subject: [PATCH 07/17] ansible: add hydrate playbook for raw machine bootstrapping - Populate staging.ini and production-usw.ini with provisioned IPs - Add hydrate.yaml: full bootstrapping (hostname, packages, Tailscale, NVMe format+mount, Grafana Alloy, service config, Vector/Axiom) - Add task files: setup_hostname, install_dependencies, setup_tailscale, configure_tailscale_firewall, setup_nvme, setup_grafana_alloy, setup_service - Add requirements.yml for artis3n.tailscale collection - Refactor setup.yaml to reuse shared tasks (NVMe, Alloy, service, Axiom) - Add Grafana Alloy for Prometheus metric scraping (scrapes /metrics, remote writes to self-hosted Prometheus) Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/README.md | 65 ++++++++++++--- ansible/ansible.cfg | 3 + ansible/hydrate.yaml | 38 +++++++++ ansible/production-usw.ini | 11 +-- ansible/requirements.yml | 4 + ansible/secrets.yml | 2 +- ansible/setup.yaml | 65 +++------------ ansible/staging.ini | 7 +- .../tasks/configure_tailscale_firewall.yaml | 43 ++++++++++ ansible/tasks/install_dependencies.yaml | 24 ++++++ ansible/tasks/setup_grafana_alloy.yaml | 74 +++++++++++++++++ ansible/tasks/setup_hostname.yaml | 19 +++++ ansible/tasks/setup_nvme.yaml | 82 +++++++++++++++++++ ansible/tasks/setup_service.yaml | 33 ++++++++ ansible/tasks/setup_tailscale.yaml | 23 ++++++ 15 files changed, 419 insertions(+), 74 deletions(-) create mode 100644 ansible/hydrate.yaml create mode 100644 ansible/requirements.yml create mode 100644 ansible/tasks/configure_tailscale_firewall.yaml create mode 100644 ansible/tasks/install_dependencies.yaml create mode 100644 
ansible/tasks/setup_grafana_alloy.yaml create mode 100644 ansible/tasks/setup_hostname.yaml create mode 100644 ansible/tasks/setup_nvme.yaml create mode 100644 ansible/tasks/setup_service.yaml create mode 100644 ansible/tasks/setup_tailscale.yaml diff --git a/ansible/README.md b/ansible/README.md index d9c75a5..bc999f0 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -6,39 +6,70 @@ Playbooks and templates for deploying smart-git-proxy to dedicated proxy nodes. ``` ansible/ - setup.yaml # One-time setup: systemd unit, NVMe mount, Vector logging - roll.yaml # Rolling deploy: build or download binary, restart + hydrate.yaml # Full bootstrapping: hostname, packages, Tailscale, + # NVMe, Alloy, service config, Vector logging + setup.yaml # Incremental setup: NVMe, Alloy, service config, Vector + roll.yaml # Rolling deploy: build or download binary, restart + requirements.yml # Ansible Galaxy dependencies (artis3n.tailscale) tasks/ - setup_axiom.yaml # Vector → Axiom log forwarding + setup_hostname.yaml # Set hostname and /etc/hosts + install_dependencies.yaml # apt packages (git, make, jq, xfsprogs, etc.) 
+ setup_tailscale.yaml # Install and configure Tailscale with SSH + configure_tailscale_firewall.yaml # UFW rules to block netscanning + setup_nvme.yaml # Detect, format (XFS), and mount NVMe data drive + setup_grafana_alloy.yaml # Prometheus scrape → remote write to Grafana + setup_service.yaml # systemd unit and env config + setup_axiom.yaml # Vector → Axiom log forwarding templates/ smart-git-proxy.service.j2 # systemd unit smart-git-proxy-env.j2 # Environment config vector.yaml.j2 # Vector config for Axiom - production-usw.ini # US-West inventory (3 nodes) - production-euw.ini # EU-West inventory (2 nodes) - production-euc.ini # EU-Central inventory (2 nodes) - staging.ini # Staging inventory - secrets.yml # ansible-vault encrypted secrets + production-usw.ini # US-West inventory (3 nodes) + production-euw.ini # EU-West inventory (placeholder) + production-euc.ini # EU-Central inventory (placeholder) + staging.ini # Staging inventory (1 node) + secrets.yml # ansible-vault encrypted secrets +``` + +## Prerequisites + +```bash +# Install Ansible Galaxy dependencies +ansible-galaxy collection install -r requirements.yml ``` ## Usage -### Initial Setup (once per node) +### Hydrate Raw Machines (once per node, first time) + +Bootstraps a freshly provisioned machine from scratch: sets hostname, installs +packages, configures Tailscale + firewall, formats NVMe, sets up Alloy metrics, +systemd service, and Vector log forwarding. ```bash -ansible-playbook -i production-usw.ini setup.yaml --ask-vault-pass +ansible-playbook -i staging.ini hydrate.yaml --vault-password-file ~/vault-password.txt +ansible-playbook -i production-usw.ini hydrate.yaml --vault-password-file ~/vault-password.txt +``` + +### Incremental Setup (re-run safe) + +Updates service config, systemd unit, Alloy, and Vector without touching +Tailscale or re-formatting NVMe. Safe to re-run. 
+ +```bash +ansible-playbook -i production-usw.ini setup.yaml --vault-password-file ~/vault-password.txt ``` ### Deploy from Branch ```bash -ansible-playbook -i production-usw.ini roll.yaml -e branch=main --ask-vault-pass +ansible-playbook -i production-usw.ini roll.yaml -e branch=main --vault-password-file ~/vault-password.txt ``` ### Deploy from Release ```bash -ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --ask-vault-pass +ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --vault-password-file ~/vault-password.txt ``` ## Configuration @@ -58,3 +89,13 @@ Key environment variables (set in `templates/smart-git-proxy-env.j2`): `secrets.yml` must contain (ansible-vault encrypted): - `github_token` — GitHub token for cloning the repo during branch builds - `axiom_token` — Axiom API token for log forwarding +- `tailscale_auth_key` — Tailscale auth key (tag: `git-proxy`) + +## Inventory + +Hosts are listed by public IP for initial hydration (SSH over public internet). +After hydration, Tailscale IPs are used for rolling deploys via CI (the +build-and-deploy workflow connects through Tailscale VPN). + +After hydration, note each node's Tailscale IP and update the inventory files +if you want to switch to Tailscale-based SSH for subsequent runs. diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg index 6d2dcd6..6dca555 100644 --- a/ansible/ansible.cfg +++ b/ansible/ansible.cfg @@ -1,3 +1,6 @@ [defaults] host_key_checking = False timeout = 30 + +[galaxy] +collections_path = ~/.ansible/collections diff --git a/ansible/hydrate.yaml b/ansible/hydrate.yaml new file mode 100644 index 0000000..fd3c219 --- /dev/null +++ b/ansible/hydrate.yaml @@ -0,0 +1,38 @@ +--- +# Full hydration for raw machines: hostname, packages, Tailscale, NVMe, service config, logging. +# Run once per node after provisioning. 
+# +# Usage: +# ansible-playbook -i staging.ini hydrate.yaml --vault-password-file ~/vault-password.txt +# ansible-playbook -i production-usw.ini hydrate.yaml --vault-password-file ~/vault-password.txt + +- name: Hydrate Smart Git Proxy Nodes + hosts: all + become: yes + serial: 4 + vars_files: + - secrets.yml + tasks: + - name: Set hostname + include_tasks: tasks/setup_hostname.yaml + + - name: Install base packages + include_tasks: tasks/install_dependencies.yaml + + - name: Set up Tailscale + include_tasks: tasks/setup_tailscale.yaml + + - name: Configure Tailscale firewall rules + include_tasks: tasks/configure_tailscale_firewall.yaml + + - name: Set up NVMe storage + include_tasks: tasks/setup_nvme.yaml + + - name: Set up Grafana Alloy for metrics + include_tasks: tasks/setup_grafana_alloy.yaml + + - name: Set up service config and systemd + include_tasks: tasks/setup_service.yaml + + - name: Set up Axiom log forwarding + include_tasks: tasks/setup_axiom.yaml diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini index c6bf394..45462f5 100644 --- a/ansible/production-usw.ini +++ b/ansible/production-usw.ini @@ -1,10 +1,11 @@ -# US-West proxy nodes (3 nodes) -# Update with actual IPs after provisioning. 
+# US-West proxy nodes (3 nodes, s4.s2.large, PhoenixNAP, 36mo reservation) +# Provisioned 2026-06-17 [all:vars] +ansible_user=ubuntu region=us-west env=production [proxy] -# proxy-usw-1 ansible_host= -# proxy-usw-2 ansible_host= -# proxy-usw-3 ansible_host= +192.240.240.207 hostname=git-proxy-usw-1 server_id=6a329c5bcd195a90018570da +192.240.240.208 hostname=git-proxy-usw-2 server_id=6a329c5dcd195a90018570db +192.240.240.209 hostname=git-proxy-usw-3 server_id=6a329c5ecd195a90018570dc diff --git a/ansible/requirements.yml b/ansible/requirements.yml new file mode 100644 index 0000000..9141d26 --- /dev/null +++ b/ansible/requirements.yml @@ -0,0 +1,4 @@ +--- +collections: + - name: artis3n.tailscale + version: ">=4.0.0" diff --git a/ansible/secrets.yml b/ansible/secrets.yml index e1ed311..f0c5c9c 100644 --- a/ansible/secrets.yml +++ b/ansible/secrets.yml @@ -1,3 +1,3 @@ # Encrypted with ansible-vault. -# Contains: github_token, axiom_token +# Contains: github_token, axiom_token, tailscale_auth_key # To edit: ansible-vault edit secrets.yml diff --git a/ansible/setup.yaml b/ansible/setup.yaml index e86e001..ce3d4d5 100644 --- a/ansible/setup.yaml +++ b/ansible/setup.yaml @@ -1,6 +1,9 @@ --- -# One-time setup: deploy systemd unit, env config, NVMe mount, Vector logging. -# Does not start the service -- the roll playbook handles that. +# Incremental setup: deploy service config, systemd unit, NVMe validation, Vector logging. +# For raw machine bootstrapping (including Tailscale, NVMe format, packages), use hydrate.yaml. 
+# +# Usage: +# ansible-playbook -i production-usw.ini setup.yaml --vault-password-file ~/vault-password.txt - name: Setup Smart Git Proxy hosts: all @@ -8,58 +11,14 @@ vars_files: - secrets.yml tasks: - - name: Determine environment from inventory file name - set_fact: - ENV: "{{ 'production' if 'production' in inventory_file else 'staging' if 'staging' in inventory_file else '' }}" + - name: Set up NVMe storage + include_tasks: tasks/setup_nvme.yaml - - name: Ensure ENV is determined - fail: - msg: "Failed to determine environment from inventory file name." - when: ENV == '' + - name: Set up Grafana Alloy for metrics + include_tasks: tasks/setup_grafana_alloy.yaml - # --- NVMe storage --- - - name: Verify NVMe mount exists - stat: - path: /mnt/nvme - register: nvme_mount + - name: Set up service config and systemd + include_tasks: tasks/setup_service.yaml - - name: Fail if NVMe mount is missing - fail: - msg: "/mnt/nvme does not exist. Proxy nodes require NVMe storage for git mirrors." 
- when: not nvme_mount.stat.exists - - - name: Create mirror directory - file: - path: "{{ mirror_dir | default('/mnt/nvme/mirrors') }}" - state: directory - mode: "0755" - - # --- Config --- - - name: Ensure config directory exists - file: - path: /etc/smart-git-proxy - state: directory - mode: "0755" - - - name: Deploy environment config - template: - src: templates/smart-git-proxy-env.j2 - dest: /etc/smart-git-proxy/env - mode: "0600" - - # --- systemd --- - - name: Deploy systemd unit - template: - src: templates/smart-git-proxy.service.j2 - dest: /etc/systemd/system/smart-git-proxy.service - mode: "0644" - register: systemd_unit - - - name: Reload systemd daemon - systemd: - daemon_reload: yes - when: systemd_unit.changed - - # --- Axiom log forwarding --- - - name: Setup Axiom log forwarding + - name: Set up Axiom log forwarding include_tasks: tasks/setup_axiom.yaml diff --git a/ansible/staging.ini b/ansible/staging.ini index b5eb016..7258823 100644 --- a/ansible/staging.ini +++ b/ansible/staging.ini @@ -1,8 +1,9 @@ -# Staging proxy nodes (1 node per region or shared) -# Update with actual IPs after provisioning. 
+# Staging proxy node (1 node, s4.s2.large, PhoenixNAP, 36mo reservation) +# Provisioned 2026-06-17 [all:vars] +ansible_user=ubuntu region=us-west env=staging [proxy] -# proxy-staging-1 ansible_host= +192.240.240.210 hostname=git-proxy-staging-1 server_id=6a329cb6cd195a90018570de diff --git a/ansible/tasks/configure_tailscale_firewall.yaml b/ansible/tasks/configure_tailscale_firewall.yaml new file mode 100644 index 0000000..1b41c03 --- /dev/null +++ b/ansible/tasks/configure_tailscale_firewall.yaml @@ -0,0 +1,43 @@ +--- +- name: Ensure UFW is installed + apt: + name: ufw + state: present + +- name: Enable UFW with default allow + ufw: + state: enabled + policy: allow + direction: incoming + logging: "on" + +- name: Allow Tailscale traffic within Tailscale network + ufw: + rule: allow + direction: out + to_ip: 100.64.0.0/10 + port: "41641" + proto: udp + +- name: Block Tailscale UDP scanning to private ranges + ufw: + rule: deny + direction: out + to_ip: "{{ item }}" + port: "41641" + proto: udp + loop: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + +- name: Block all other Tailscale outbound scanning + ufw: + rule: deny + direction: out + port: "41641" + proto: udp + +- name: Reload UFW + ufw: + state: reloaded diff --git a/ansible/tasks/install_dependencies.yaml b/ansible/tasks/install_dependencies.yaml new file mode 100644 index 0000000..6f84aa5 --- /dev/null +++ b/ansible/tasks/install_dependencies.yaml @@ -0,0 +1,24 @@ +--- +- name: Wait for dpkg lock + shell: while lsof /var/lib/dpkg/lock-frontend 2>/dev/null; do sleep 10; done; + +- name: Update apt cache + apt: + update_cache: yes + retries: 5 + delay: 10 + register: apt_update + until: apt_update is success + +- name: Install required packages + apt: + name: + - curl + - wget + - git + - make + - jq + - xfsprogs + - nvme-cli + - ufw + state: present diff --git a/ansible/tasks/setup_grafana_alloy.yaml b/ansible/tasks/setup_grafana_alloy.yaml new file mode 100644 index 0000000..6965037 --- /dev/null 
+++ b/ansible/tasks/setup_grafana_alloy.yaml @@ -0,0 +1,74 @@ +--- +- name: Set Grafana endpoints + set_fact: + self_hosted_prometheus_url: "http://grafana.internal.blacksmith.sh:9090/api/v1/write" + +- name: Install Grafana Alloy + shell: | + ARCH=$(dpkg --print-architecture) + wget -q -O /tmp/alloy.deb "https://github.com/grafana/alloy/releases/download/v1.5.1/alloy-1.5.1-1.${ARCH}.deb" + DEBIAN_FRONTEND=noninteractive dpkg -i --force-confnew /tmp/alloy.deb || apt-get install -f -y + rm -f /tmp/alloy.deb + systemctl daemon-reload + args: + creates: /usr/bin/alloy + +- name: Create Alloy configuration directory + file: + path: /etc/alloy + state: directory + mode: "0755" + +- name: Configure Grafana Alloy + copy: + content: | + // Prometheus scrape of local smart-git-proxy /metrics endpoint + prometheus.scrape "smart_git_proxy" { + targets = [{ + __address__ = "127.0.0.1:8080", + }] + metrics_path = "/metrics" + scrape_interval = "15s" + forward_to = [prometheus.relabel.add_labels.receiver] + } + + // Add environment/region labels + prometheus.relabel "add_labels" { + forward_to = [prometheus.remote_write.self_hosted.receiver] + + rule { + target_label = "environment" + replacement = "{{ env }}" + } + rule { + target_label = "region" + replacement = "{{ region }}" + } + rule { + target_label = "job" + replacement = "smart-git-proxy" + } + } + + // Remote write to self-hosted Prometheus + prometheus.remote_write "self_hosted" { + endpoint { + url = "{{ self_hosted_prometheus_url }}" + } + } + dest: /etc/alloy/config.alloy + mode: "0644" + register: alloy_config + +- name: Enable and restart Alloy + systemd: + name: alloy + state: restarted + enabled: yes + when: alloy_config.changed + +- name: Ensure Alloy is running + systemd: + name: alloy + state: started + enabled: yes diff --git a/ansible/tasks/setup_hostname.yaml b/ansible/tasks/setup_hostname.yaml new file mode 100644 index 0000000..82d528e --- /dev/null +++ b/ansible/tasks/setup_hostname.yaml @@ -0,0 +1,19 
@@ +--- +- name: Set hostname + hostname: + name: "{{ hostname }}" + +- name: Add hostname to /etc/hosts + lineinfile: + path: /etc/hosts + line: "{{ ansible_default_ipv4.address }} {{ hostname }}" + regexp: "^{{ ansible_default_ipv4.address }}" + state: present + +- name: Prevent cloud-init from overwriting hostname + copy: + content: | + #cloud-config + preserve_hostname: true + dest: /etc/cloud/cloud.cfg.d/99-preserve-hostname.cfg + mode: "0644" diff --git a/ansible/tasks/setup_nvme.yaml b/ansible/tasks/setup_nvme.yaml new file mode 100644 index 0000000..c9e9e87 --- /dev/null +++ b/ansible/tasks/setup_nvme.yaml @@ -0,0 +1,82 @@ +--- +# s4.s2.large machines have 1x 1TB OS + 2x 8TB NVMe data drives. +# We format and mount one data drive for git mirrors; on re-runs the drive already mounted at /mnt/nvme is re-selected (sizes come from lsblk -b, in bytes). + +- name: Get block device info + command: lsblk -b -J -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE + register: lsblk_output + +- name: Parse block devices + set_fact: + block_devices: "{{ (lsblk_output.stdout | from_json).blockdevices }}" + +- name: Find data NVMe drives (>2TB, no OS partitions) + set_fact: + data_drives: >- + {%- set drives = [] -%} + {%- for device in block_devices -%} + {%- if device.type == 'disk' -%} + {%- set has_os = [] -%} + {%- if device.children is defined -%} + {%- for child in device.children -%} + {%- if child.mountpoint in ['/', '/boot', '/boot/efi'] -%} + {%- if has_os.append(true) -%}{%- endif -%} + {%- endif -%} + {%- if child.children is defined -%} + {%- for gc in child.children -%} + {%- if gc.mountpoint in ['/', '/boot', '/boot/efi'] -%} + {%- if has_os.append(true) -%}{%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set size_tb = (device.size | int) / 1099511627776 -%} + {%- if has_os | length == 0 and (device.mountpoint is none or device.mountpoint == '/mnt/nvme') and size_tb > 2 -%} + {%- if (drives.insert(0, device.name) if device.mountpoint == '/mnt/nvme' else drives.append(device.name)) -%}{%- endif -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {{ drives }} + +- name: Display detected data drives + debug: + msg:
"Data NVMe drives: {{ data_drives }}" + +- name: Fail if no data drives found + fail: + msg: "No NVMe data drives >2TB found. Expected s4.s2.large with 2x 8TB NVMe." + when: data_drives | length == 0 + +- name: Check if /mnt/nvme is already mounted + command: mountpoint -q /mnt/nvme + register: nvme_mounted + failed_when: false + changed_when: false + +- name: Format first data drive as XFS + filesystem: + fstype: xfs + dev: "/dev/{{ data_drives[0] }}" + force: no + when: nvme_mounted.rc != 0 + +- name: Create /mnt/nvme mount point + file: + path: /mnt/nvme + state: directory + mode: "0755" + +- name: Mount NVMe drive + mount: + path: /mnt/nvme + src: "/dev/{{ data_drives[0] }}" + fstype: xfs + opts: defaults,noatime + state: mounted + +- name: Create mirrors directory + file: + path: /mnt/nvme/mirrors + state: directory + mode: "0755" diff --git a/ansible/tasks/setup_service.yaml b/ansible/tasks/setup_service.yaml new file mode 100644 index 0000000..5e630d5 --- /dev/null +++ b/ansible/tasks/setup_service.yaml @@ -0,0 +1,33 @@ +--- +- name: Determine environment from inventory + set_fact: + ENV: "{{ env }}" + +- name: Ensure config directory exists + file: + path: /etc/smart-git-proxy + state: directory + mode: "0755" + +- name: Deploy environment config + template: + src: templates/smart-git-proxy-env.j2 + dest: /etc/smart-git-proxy/env + mode: "0600" + +- name: Deploy systemd unit + template: + src: templates/smart-git-proxy.service.j2 + dest: /etc/systemd/system/smart-git-proxy.service + mode: "0644" + register: systemd_unit + +- name: Reload systemd daemon + systemd: + daemon_reload: yes + when: systemd_unit.changed + +- name: Enable smart-git-proxy service + systemd: + name: smart-git-proxy + enabled: yes diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml new file mode 100644 index 0000000..0a5a901 --- /dev/null +++ b/ansible/tasks/setup_tailscale.yaml @@ -0,0 +1,23 @@ +--- +- name: Include Tailscale role + 
ansible.builtin.include_role: + name: artis3n.tailscale + vars: + tailscale_authkey: "{{ tailscale_auth_key }}" + tailscale_args: "--ssh --hostname {{ hostname }}" + tailscale_tags: "{{ ['git-proxy'] }}" + +- name: Ensure Tailscale is running + ansible.builtin.systemd: + name: tailscaled + state: started + enabled: yes + +- name: Get Tailscale IP + ansible.builtin.command: tailscale ip + register: tailscale_ip + changed_when: false + +- name: Display Tailscale IP + ansible.builtin.debug: + msg: "Tailscale IP for {{ hostname }}: {{ tailscale_ip.stdout }}" From b862c45c3441c2bb58a19ae7df920afe900711fa Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 13:28:06 +0000 Subject: [PATCH 08/17] ansible: fix inventory to use Tailscale hostnames as keys - Use Tailscale hostnames (git-proxy-usw-1, etc.) as inventory host keys so tailscale ping and Ansible connect via MagicDNS from CI - Keep ansible_host= for initial hydration (SSH over public IP) - Fix checkout to use branch input on workflow_dispatch - Remove hostname inventory var (inventory_hostname is the hostname now) Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/build-and-deploy.yaml | 2 ++ ansible/production-usw.ini | 10 +++++++--- ansible/staging.ini | 6 +++++- ansible/tasks/setup_hostname.yaml | 4 ++-- ansible/tasks/setup_tailscale.yaml | 4 ++-- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml index aaf72f1..bde9dd2 100644 --- a/.github/workflows/build-and-deploy.yaml +++ b/.github/workflows/build-and-deploy.yaml @@ -33,6 +33,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.branch || github.ref }} - name: Set up Go uses: actions/setup-go@v5 diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini index 45462f5..f17ada7 100644 --- a/ansible/production-usw.ini +++ 
b/ansible/production-usw.ini @@ -1,11 +1,15 @@ # US-West proxy nodes (3 nodes, s4.s2.large, PhoenixNAP, 36mo reservation) # Provisioned 2026-06-17 +# +# Host keys are Tailscale hostnames (resolved via MagicDNS from CI). +# ansible_host is the public IP (used for initial hydration before Tailscale). +# After hydration, remove ansible_host lines to connect via Tailscale. [all:vars] ansible_user=ubuntu region=us-west env=production [proxy] -192.240.240.207 hostname=git-proxy-usw-1 server_id=6a329c5bcd195a90018570da -192.240.240.208 hostname=git-proxy-usw-2 server_id=6a329c5dcd195a90018570db -192.240.240.209 hostname=git-proxy-usw-3 server_id=6a329c5ecd195a90018570dc +git-proxy-usw-1 ansible_host=192.240.240.207 server_id=6a329c5bcd195a90018570da +git-proxy-usw-2 ansible_host=192.240.240.208 server_id=6a329c5dcd195a90018570db +git-proxy-usw-3 ansible_host=192.240.240.209 server_id=6a329c5ecd195a90018570dc diff --git a/ansible/staging.ini b/ansible/staging.ini index 7258823..a0fd252 100644 --- a/ansible/staging.ini +++ b/ansible/staging.ini @@ -1,9 +1,13 @@ # Staging proxy node (1 node, s4.s2.large, PhoenixNAP, 36mo reservation) # Provisioned 2026-06-17 +# +# Host keys are Tailscale hostnames (resolved via MagicDNS from CI). +# ansible_host is the public IP (used for initial hydration before Tailscale). +# After hydration, remove ansible_host lines to connect via Tailscale. 
[all:vars] ansible_user=ubuntu region=us-west env=staging [proxy] -192.240.240.210 hostname=git-proxy-staging-1 server_id=6a329cb6cd195a90018570de +git-proxy-staging-1 ansible_host=192.240.240.210 server_id=6a329cb6cd195a90018570de diff --git a/ansible/tasks/setup_hostname.yaml b/ansible/tasks/setup_hostname.yaml index 82d528e..4679323 100644 --- a/ansible/tasks/setup_hostname.yaml +++ b/ansible/tasks/setup_hostname.yaml @@ -1,12 +1,12 @@ --- - name: Set hostname hostname: - name: "{{ hostname }}" + name: "{{ inventory_hostname }}" - name: Add hostname to /etc/hosts lineinfile: path: /etc/hosts - line: "{{ ansible_default_ipv4.address }} {{ hostname }}" + line: "{{ ansible_default_ipv4.address }} {{ inventory_hostname }}" regexp: "^{{ ansible_default_ipv4.address }}" state: present diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml index 0a5a901..11ca6d2 100644 --- a/ansible/tasks/setup_tailscale.yaml +++ b/ansible/tasks/setup_tailscale.yaml @@ -4,7 +4,7 @@ name: artis3n.tailscale vars: tailscale_authkey: "{{ tailscale_auth_key }}" - tailscale_args: "--ssh --hostname {{ hostname }}" + tailscale_args: "--ssh --hostname {{ inventory_hostname }}" tailscale_tags: "{{ ['git-proxy'] }}" - name: Ensure Tailscale is running @@ -20,4 +20,4 @@ - name: Display Tailscale IP ansible.builtin.debug: - msg: "Tailscale IP for {{ hostname }}: {{ tailscale_ip.stdout }}" + msg: "Tailscale IP for {{ inventory_hostname }}: {{ tailscale_ip.stdout }}" From 8d74545e645038da11818b8d546a7ffb9ae5b31b Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 13:31:51 +0000 Subject: [PATCH 09/17] ansible: fix rclone config indentation and add instance label to Alloy - Replace heredoc with printf to avoid YAML indentation leaking into rclone.conf - Add instance label (inventory_hostname) to Alloy relabel rules so per-node dashboards and by(instance) alerts can distinguish hosts Co-Authored-By: Devin AI 
<158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/build-and-deploy.yaml | 10 ++-------- ansible/tasks/setup_grafana_alloy.yaml | 4 ++++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml index bde9dd2..f012157 100644 --- a/.github/workflows/build-and-deploy.yaml +++ b/.github/workflows/build-and-deploy.yaml @@ -62,14 +62,8 @@ jobs: R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} run: | mkdir -p ~/.config/rclone - cat > ~/.config/rclone/rclone.conf < ~/.config/rclone/rclone.conf - name: Push binary to R2 if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch' diff --git a/ansible/tasks/setup_grafana_alloy.yaml b/ansible/tasks/setup_grafana_alloy.yaml index 6965037..bb44c89 100644 --- a/ansible/tasks/setup_grafana_alloy.yaml +++ b/ansible/tasks/setup_grafana_alloy.yaml @@ -48,6 +48,10 @@ target_label = "job" replacement = "smart-git-proxy" } + rule { + target_label = "instance" + replacement = "{{ inventory_hostname }}" + } } // Remote write to self-hosted Prometheus From 1efbbfd421cf25212d60596134ed0d71767e7fc2 Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 13:33:46 +0000 Subject: [PATCH 10/17] ansible: widen Slack failure notification to any step failure Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/build-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml index f012157..a02d136 100644 --- a/.github/workflows/build-and-deploy.yaml +++ b/.github/workflows/build-and-deploy.yaml @@ -173,7 +173,7 @@ jobs: ANSIBLE_HOST_KEY_CHECKING: "False" - name: Send Slack notification on failure - if: failure() && steps.ansible-deploy.outcome == 'failure' + if: failure() uses: 
slackapi/slack-github-action@v1 with: payload: | From 2f7fa822d47767b21c3ae6dadfc0625501b697ff Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 14:08:41 +0000 Subject: [PATCH 11/17] ansible: fix requirements.yml - artis3n.tailscale is a role not a collection Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/requirements.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/requirements.yml b/ansible/requirements.yml index 9141d26..2495fbd 100644 --- a/ansible/requirements.yml +++ b/ansible/requirements.yml @@ -1,4 +1,4 @@ --- -collections: +roles: - name: artis3n.tailscale - version: ">=4.0.0" + version: "4.5.0" From d5f614e1746f9a6def02ec1faf43bc49b411380b Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 14:10:46 +0000 Subject: [PATCH 12/17] ansible: fix tailscale dependency - use collection format matching fa repo - Remove version pin (let Galaxy resolve latest) - Use artis3n.tailscale.machine (collection role) matching fa/infra/ansible pattern Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/requirements.yml | 3 +-- ansible/tasks/setup_tailscale.yaml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ansible/requirements.yml b/ansible/requirements.yml index 2495fbd..a0c53c6 100644 --- a/ansible/requirements.yml +++ b/ansible/requirements.yml @@ -1,4 +1,3 @@ --- -roles: +collections: - name: artis3n.tailscale - version: "4.5.0" diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml index 11ca6d2..f885661 100644 --- a/ansible/tasks/setup_tailscale.yaml +++ b/ansible/tasks/setup_tailscale.yaml @@ -1,7 +1,7 @@ --- - name: Include Tailscale role ansible.builtin.include_role: - name: artis3n.tailscale + name: artis3n.tailscale.machine vars: tailscale_authkey: "{{ tailscale_auth_key }}" tailscale_args: "--ssh --hostname {{ inventory_hostname }}" From 
66e8587ae29ab34e4eed6faf078d72617b7542e5 Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 15:31:36 +0000 Subject: [PATCH 13/17] ansible: fix vector config - use include_units instead of units Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/templates/vector.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2 index 5b06e65..de5743b 100644 --- a/ansible/templates/vector.yaml.j2 +++ b/ansible/templates/vector.yaml.j2 @@ -2,7 +2,7 @@ sources: smart_git_proxy_logs: type: journald - units: + include_units: - smart-git-proxy transforms: From 734002adad3556ad6e2d3badec27173ee8bd9aa7 Mon Sep 17 00:00:00 2001 From: Piotr Bejda Date: Wed, 17 Jun 2026 11:31:43 -0400 Subject: [PATCH 14/17] ansible: secrets --- ansible/secrets.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/ansible/secrets.yml b/ansible/secrets.yml index f0c5c9c..3dd4a09 100644 --- a/ansible/secrets.yml +++ b/ansible/secrets.yml @@ -1,3 +1,12 @@ -# Encrypted with ansible-vault. 
-# Contains: github_token, axiom_token, tailscale_auth_key -# To edit: ansible-vault edit secrets.yml +$ANSIBLE_VAULT;1.1;AES256 +35386437653735306338376264643236303766323334656130363862353332333838666262316262 +6437376137356362636362316666386436313062376338640a303862636133323861633664626335 +63643534613963636137663533396433343833376266313031386134663166633065343438616537 +3166663338663162340a386163353433663036653236613438366631303431323463633633626265 +64656438313266303662333539313432386163373864323136663137336665383938343932666233 +64346165643261616265343861633163363462366235343639343165366438663364306638396335 +33636238386338366331336662383137376262303061326234323434383835373564346266626532 +34356632303836396662323063323733373031373735373366386235666365346537373837306261 +36323065333933623963633835306535313835353430623365373038646665333666633037343830 +36353135663563346134336361626133663238386532393863316336323932303661666636323161 +663866333462633535396465346166336132 From d33b5a034e99da3ff575335d8fc237bff1896bea Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 15:45:27 +0000 Subject: [PATCH 15/17] ansible: fix VRL merge calls to use infallible merge!() Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/templates/vector.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2 index de5743b..e2c698f 100644 --- a/ansible/templates/vector.yaml.j2 +++ b/ansible/templates/vector.yaml.j2 @@ -14,11 +14,11 @@ transforms: if exists(.message) { parsed, err = parse_json(.message) if err == null { - . = merge(., parsed) + . = merge!(., parsed) } } - . = merge(., { + . 
= merge!(., { "host": "${HOSTNAME}", "environment": "{{ env }}", "region": "{{ region }}", From fce24a5b03e4563d5d926de909f080c53b0f8795 Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 15:57:36 +0000 Subject: [PATCH 16/17] ansible: always restart Vector and reset failed state before start Prevents stale systemd failed state from blocking subsequent runs. Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/tasks/setup_axiom.yaml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml index cf3a2f9..ec0be73 100644 --- a/ansible/tasks/setup_axiom.yaml +++ b/ansible/tasks/setup_axiom.yaml @@ -43,15 +43,13 @@ mode: "0600" register: vector_config +- name: Reset Vector failed state + ansible.builtin.command: systemctl reset-failed vector + changed_when: false + failed_when: false + - name: Enable and restart Vector systemd: name: vector state: restarted enabled: yes - when: vector_config.changed or vector_systemd_override.changed - -- name: Ensure Vector is running - systemd: - name: vector - state: started - enabled: yes From add8e1595aa82ead0d56df35f78c8d9bff530d7a Mon Sep 17 00:00:00 2001 From: piotr Date: Wed, 17 Jun 2026 16:10:18 +0000 Subject: [PATCH 17/17] ansible: fix vector config ownership - must be readable by vector user Co-Authored-By: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- ansible/tasks/setup_axiom.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml index ec0be73..e27c856 100644 --- a/ansible/tasks/setup_axiom.yaml +++ b/ansible/tasks/setup_axiom.yaml @@ -40,6 +40,8 @@ template: src: templates/vector.yaml.j2 dest: /etc/vector/vector.yaml + owner: vector + group: vector mode: "0600" register: vector_config