diff --git a/.github/workflows/build-and-deploy.yaml b/.github/workflows/build-and-deploy.yaml new file mode 100644 index 0000000..a02d136 --- /dev/null +++ b/.github/workflows/build-and-deploy.yaml @@ -0,0 +1,190 @@ +name: Build and Deploy + +on: + push: + branches: + - main + - production + pull_request: + workflow_dispatch: + inputs: + branch: + description: "Branch to deploy (main/production)" + required: true + type: choice + options: + - main + - production + default: "main" + run_deployment: + description: "Run ansible deployment after build" + required: true + type: boolean + default: false + +concurrency: + group: "build-and-deploy-${{ github.ref == 'refs/heads/production' && 'production' || github.event.inputs.branch == 'production' && 'production' || 'staging' }}" + cancel-in-progress: false + +jobs: + build-and-deploy: + runs-on: blacksmith-8vcpu-ubuntu-2204 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.branch || github.ref }} + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Build binary + run: | + export GO=go + make build + cp bin/smart-git-proxy ./smart-git-proxy + + - name: Run tests + if: github.event_name == 'pull_request' + run: go test ./... 
+ + - name: Install rclone + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch' + run: curl https://rclone.org/install.sh | sudo bash + + - name: Configure rclone + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch' + env: + R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} + R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} + run: | + mkdir -p ~/.config/rclone + printf '[r2]\ntype = s3\nprovider = Cloudflare\naccess_key_id = %s\nsecret_access_key = %s\nendpoint = https://1ede90a8395416f286ba9f692dc6bacf.r2.cloudflarestorage.com\n' \ + "$R2_ACCESS_KEY_ID" "$R2_SECRET_ACCESS_KEY" > ~/.config/rclone/rclone.conf + + - name: Push binary to R2 + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || github.event_name == 'workflow_dispatch' + run: | + SHA=$(git rev-parse HEAD) + echo "SHA=$SHA" >> $GITHUB_ENV + if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/production/$SHA + else + rclone copy ./smart-git-proxy r2:useblacksmith/smart-git-proxy/main/$SHA + fi + + - name: Determine deploy target + id: deploy-target + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' || (github.event_name == 'workflow_dispatch' && github.event.inputs.run_deployment == 'true') + run: | + if [[ "$GITHUB_REF" == "refs/heads/production" || "${{ github.event.inputs.branch }}" == "production" ]]; then + echo "env=production" >> $GITHUB_OUTPUT + echo "branch=production" >> $GITHUB_OUTPUT + else + echo "env=staging" >> $GITHUB_OUTPUT + echo "branch=${{ github.event.inputs.branch || github.ref_name }}" >> $GITHUB_OUTPUT + fi + + - name: Set up Python + if: steps.deploy-target.outcome == 'success' + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: 
Install Ansible + if: steps.deploy-target.outcome == 'success' + run: | + python -m pip install --upgrade pip + pip install ansible + + - name: Connect to Tailscale + if: steps.deploy-target.outcome == 'success' + uses: tailscale/github-action@v3 + with: + oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} + oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} + tags: tag:ci + + - name: Create Ansible Vault password file + if: steps.deploy-target.outcome == 'success' + # Pass the secret through the environment instead of expanding it inline, where bash would + # interpret characters such as $, " or a backtick; umask 077 keeps the file private to the runner user. + env: + ANSIBLE_SECRET: ${{ secrets.ANSIBLE_SECRET }} + run: | + umask 077 + printf '%s\n' "$ANSIBLE_SECRET" > ~/vault-password.txt + + - name: Check host connectivity + if: steps.deploy-target.outcome == 'success' + working-directory: ansible + run: | + sleep 10 + if [[ "${{ steps.deploy-target.outputs.env }}" == "production" ]]; then + INVENTORY_FILES="production-usw.ini production-euw.ini production-euc.ini" + else + INVENTORY_FILES="staging.ini" + fi + for INVENTORY_FILE in $INVENTORY_FILES; do + echo "=== Checking hosts in $INVENTORY_FILE ===" + HOSTS=$(ansible-inventory -i $INVENTORY_FILE --list | jq -r '._meta.hostvars | keys[]') + if [ -z "$HOSTS" ]; then + echo "warning: no hosts found in $INVENTORY_FILE, skipping" + continue + fi + for host in $HOSTS; do + echo "Testing connectivity to $host..." + start_time=$(date +%s) + while true; do + if tailscale ping -c 1 --timeout=5s $host >/dev/null 2>&1; then + echo "$host is reachable" + break + fi + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if [ $elapsed -ge 30 ]; then + echo "error: timeout after 30s waiting for $host" + exit 1 + fi + echo "Waiting for $host... 
(${elapsed}s elapsed)" + sleep 5 + done + done + done + env: + ANSIBLE_HOST_KEY_CHECKING: "False" + + - name: Run Ansible rolling deploy + id: ansible-deploy + if: steps.deploy-target.outcome == 'success' + working-directory: ansible + run: | + BRANCH_ARG="-e branch=${{ steps.deploy-target.outputs.branch }}" + if [[ "${{ steps.deploy-target.outputs.env }}" == "production" ]]; then + for REGION_INI in production-usw.ini production-euw.ini production-euc.ini; do + echo "=== Rolling region: $REGION_INI ===" + ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i "$REGION_INI" \ + --vault-password-file ~/vault-password.txt \ + roll.yaml $BRANCH_ARG -v + done + else + ANSIBLE_CONFIG=./ansible.cfg ansible-playbook -i staging.ini \ + --vault-password-file ~/vault-password.txt \ + roll.yaml $BRANCH_ARG -v + fi + env: + ANSIBLE_HOST_KEY_CHECKING: "False" + + - name: Send Slack notification on failure + if: failure() + uses: slackapi/slack-github-action@v1 + with: + payload: | + { + "text": "Ansible deploy failed for smart-git-proxy! 
Branch: ${{ github.ref_name || github.event.inputs.branch }}, Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # the webhook URL is itself the credential: keep it in repository secrets, never in the repo diff --git a/.github/workflows/deploy-grafana-dashboards.yml b/.github/workflows/deploy-grafana-dashboards.yml new file mode 100644 index 0000000..a207aec --- /dev/null +++ b/.github/workflows/deploy-grafana-dashboards.yml @@ -0,0 +1,229 @@ +name: Deploy Dashboards & Alerts to Self-Hosted Grafana + +on: + push: + branches: [main, production] + paths: + - "grafana/dashboards/*.json" + - "grafana/alert-rules/*.yaml" + - "grafana/alert-rules/*.yml" + - ".github/workflows/deploy-grafana-dashboards.yml" + pull_request: + paths: + - "grafana/dashboards/*.json" + - "grafana/alert-rules/*.yaml" + - "grafana/alert-rules/*.yml" + - ".github/workflows/deploy-grafana-dashboards.yml" + workflow_dispatch: + +jobs: + deploy-dashboards: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set folder name + run: | + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + echo "FOLDER_NAME=Smart Git Proxy Production" >> $GITHUB_ENV + else + echo "FOLDER_NAME=Smart Git Proxy Staging" >> $GITHUB_ENV + fi + + - name: Validate JSON syntax + run: | + for dashboard in grafana/dashboards/*.json; do + echo "Validating $dashboard..." + jq . 
"$dashboard" > /dev/null || exit 1 + done + echo "All dashboard JSON files are valid" + + - name: Deploy dashboards to Self-Hosted Grafana + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' + env: + GRAFANA_URL: ${{ secrets.SELF_HOSTED_GRAFANA_URL }} + GRAFANA_USER: ${{ secrets.SELF_HOSTED_GRAFANA_USER }} + GRAFANA_PASSWORD: ${{ secrets.SELF_HOSTED_GRAFANA_PASSWORD }} + run: | + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + UID_SUFFIX="-prod-self" + else + UID_SUFFIX="-staging-self" + fi + + AUTH_HEADER="Authorization: Basic $(echo -n "$GRAFANA_USER:$GRAFANA_PASSWORD" | base64)" + + # Ensure folder exists + FOLDERS_RESPONSE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/folders") + FOLDER_UID=$(echo "$FOLDERS_RESPONSE" | jq -r --arg name "$FOLDER_NAME" '.[] | select(.title == $name) | .uid' | head -1) + + if [ -z "$FOLDER_UID" ] || [ "$FOLDER_UID" == "null" ]; then + FOLDER_UID=$(echo "$FOLDER_NAME" | tr '[:upper:]' '[:lower:]' | tr ' ' '-')-$(date +%s) + CREATE_RESPONSE=$(curl -s -X POST \ + -H "$AUTH_HEADER" \ + -H "Content-Type: application/json" \ + -d "{\"title\": \"$FOLDER_NAME\", \"uid\": \"$FOLDER_UID\"}" \ + "$GRAFANA_URL/api/folders") + if ! echo "$CREATE_RESPONSE" | grep -q '"uid"'; then + echo "Error creating folder: $CREATE_RESPONSE" + exit 1 + fi + fi + + echo "FOLDER_UID=$FOLDER_UID" >> $GITHUB_ENV + + for dashboard_file in grafana/dashboards/*.json; do + if [ -f "$dashboard_file" ]; then + dashboard_name=$(basename "$dashboard_file" .json) + echo "Uploading $dashboard_name..." + + DASHBOARD_JSON=$(cat "$dashboard_file") + + # Append environment suffix to UID + if echo "$DASHBOARD_JSON" | jq -e '.uid' > /dev/null 2>&1; then + ORIGINAL_UID=$(echo "$DASHBOARD_JSON" | jq -r '.uid') + DASHBOARD_JSON=$(echo "$DASHBOARD_JSON" | jq --arg uid "${ORIGINAL_UID}${UID_SUFFIX}" '.uid = $uid') + else + DASHBOARD_JSON=$(echo "$DASHBOARD_JSON" | jq --arg uid "${dashboard_name}${UID_SUFFIX}" '. 
+ {uid: $uid}') + fi + + PAYLOAD_FILE=$(mktemp) + echo "$DASHBOARD_JSON" | jq \ + --arg folderUid "$FOLDER_UID" \ + '{dashboard: ., folderUid: $folderUid, overwrite: true}' > "$PAYLOAD_FILE" + + RESPONSE=$(curl -s -X POST \ + -H "$AUTH_HEADER" \ + -H "Content-Type: application/json" \ + --data-binary "@$PAYLOAD_FILE" \ + "$GRAFANA_URL/api/dashboards/db") + rm -f "$PAYLOAD_FILE" + + if echo "$RESPONSE" | grep -q '"status":"success"'; then + echo " Uploaded $dashboard_name (version: $(echo "$RESPONSE" | jq -r .version))" + else + echo " Failed: $RESPONSE" + echo " Continuing..." + fi + fi + done + + - name: Deploy alert rules to Self-Hosted Grafana + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/production' + env: + GRAFANA_URL: ${{ secrets.SELF_HOSTED_GRAFANA_URL }} + GRAFANA_USER: ${{ secrets.SELF_HOSTED_GRAFANA_USER }} + GRAFANA_PASSWORD: ${{ secrets.SELF_HOSTED_GRAFANA_PASSWORD }} + run: | + AUTH_HEADER="Authorization: Basic $(echo -n "$GRAFANA_USER:$GRAFANA_PASSWORD" | base64)" + + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + DASHBOARD_UID_SUFFIX="-prod-self" + else + DASHBOARD_UID_SUFFIX="-staging-self" + fi + + # Resolve Prometheus datasource UID + DATASOURCES_RESPONSE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/datasources") + DATASOURCE_UID=$(echo "$DATASOURCES_RESPONSE" | jq -r '[.[] | select(.type=="prometheus")] | .[0].uid // empty') + if [ -z "$DATASOURCE_UID" ] || [ "$DATASOURCE_UID" = "null" ]; then + echo "Could not resolve Prometheus datasource UID" + exit 1 + fi + echo "Using Prometheus datasource UID: $DATASOURCE_UID" + + # Use same folder as dashboards + ALERT_FOLDER_UID="${FOLDER_UID}" + + shopt -s nullglob + alert_files=(grafana/alert-rules/*.yaml grafana/alert-rules/*.yml) + if [ ${#alert_files[@]} -eq 0 ]; then + echo "No alert rule files found, skipping" + exit 0 + fi + + # Install yq + if ! 
command -v yq &> /dev/null; then + wget -q https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O yq + chmod +x yq + YQ_CMD="./yq" + else + YQ_CMD="yq" + fi + + for alert_file in "${alert_files[@]}"; do + alert_name=$(basename "$alert_file" .yaml) + alert_name=$(basename "$alert_name" .yml) + echo "Uploading alert rules from $alert_name..." + + ALERT_JSON=$($YQ_CMD eval -o=json "$alert_file") + + # Replace datasource placeholders and template variables + ALERT_JSON=$(echo "$ALERT_JSON" | jq --arg uid "$DATASOURCE_UID" ' + .groups |= map( + del(.folder) | + .rules |= map( + .data |= map( + (if (has("datasourceUid") and .datasourceUid == "${datasource}") then + .datasourceUid = $uid + else . end) | + if has("model") and (.model | has("expr")) then + .model.expr = (.model.expr | + gsub("\\$instance"; ".*") | + gsub("instance=~\"\\*\""; "instance=~\".*\"")) + else . end + ) + ) + )') + + GROUPS_COUNT=$(echo "$ALERT_JSON" | jq '.groups | length') + for group_idx in $(seq 0 $((GROUPS_COUNT - 1))); do + RULES_COUNT=$(echo "$ALERT_JSON" | jq --argjson g "$group_idx" '.groups[$g].rules | length') + GROUP_NAME=$(echo "$ALERT_JSON" | jq -r --argjson g "$group_idx" '.groups[$g].name') + GROUP_INTERVAL=$(echo "$ALERT_JSON" | jq -r --argjson g "$group_idx" '.groups[$g].interval // "1m"') + + for rule_idx in $(seq 0 $((RULES_COUNT - 1))); do + RULE=$(echo "$ALERT_JSON" | jq --argjson g "$group_idx" --argjson idx "$rule_idx" '.groups[$g].rules[$idx]') + ORIGINAL_RULE_UID=$(echo "$RULE" | jq -r '.uid') + + if [[ "${{ github.ref }}" == "refs/heads/production" ]]; then + RULE_UID="${ORIGINAL_RULE_UID}_p" + else + RULE_UID="${ORIGINAL_RULE_UID}_s" + fi + + RULE_PAYLOAD=$(echo "$RULE" | jq \ + --arg folderUid "$ALERT_FOLDER_UID" \ + --arg groupName "$GROUP_NAME" \ + --arg interval "$GROUP_INTERVAL" \ + --arg ruleUid "$RULE_UID" ' + . 
+ {uid: $ruleUid, folderUID: $folderUid, ruleGroup: $groupName, interval: $interval}') + + RULE_PAYLOAD=$(echo "$RULE_PAYLOAD" | jq --arg suffix "$DASHBOARD_UID_SUFFIX" ' + if (.annotations.__dashboardUid__ // "") != "" then + .annotations.__dashboardUid__ += $suffix + else . end') + + # Upsert rule + EXISTING_RULE=$(curl -s -H "$AUTH_HEADER" "$GRAFANA_URL/api/v1/provisioning/alert-rules/$RULE_UID") + if echo "$EXISTING_RULE" | grep -q '"uid"'; then + RESPONSE=$(curl -s -X PUT \ + -H "$AUTH_HEADER" -H "Content-Type: application/json" -H "X-Disable-Provenance: true" \ + -d "$RULE_PAYLOAD" "$GRAFANA_URL/api/v1/provisioning/alert-rules/$RULE_UID") + else + RESPONSE=$(curl -s -X POST \ + -H "$AUTH_HEADER" -H "Content-Type: application/json" -H "X-Disable-Provenance: true" \ + -d "$RULE_PAYLOAD" "$GRAFANA_URL/api/v1/provisioning/alert-rules") + fi + + if echo "$RESPONSE" | grep -q '"uid"'; then + echo " Processed rule: $RULE_UID" + else + echo " Failed rule $RULE_UID: $RESPONSE" + exit 1 + fi + done + done + done + + echo "Alert rules deployment complete!" diff --git a/ansible/README.md b/ansible/README.md new file mode 100644 index 0000000..bc999f0 --- /dev/null +++ b/ansible/README.md @@ -0,0 +1,101 @@ +# Ansible Deployment + +Playbooks and templates for deploying smart-git-proxy to dedicated proxy nodes. + +## Directory Structure + +``` +ansible/ + hydrate.yaml # Full bootstrapping: hostname, packages, Tailscale, + # NVMe, Alloy, service config, Vector logging + setup.yaml # Incremental setup: NVMe, Alloy, service config, Vector + roll.yaml # Rolling deploy: build or download binary, restart + requirements.yml # Ansible Galaxy dependencies (artis3n.tailscale) + tasks/ + setup_hostname.yaml # Set hostname and /etc/hosts + install_dependencies.yaml # apt packages (git, make, jq, xfsprogs, etc.) 
+ setup_tailscale.yaml # Install and configure Tailscale with SSH + configure_tailscale_firewall.yaml # UFW rules to block netscanning + setup_nvme.yaml # Detect, format (XFS), and mount NVMe data drive + setup_grafana_alloy.yaml # Prometheus scrape → remote write to Grafana + setup_service.yaml # systemd unit and env config + setup_axiom.yaml # Vector → Axiom log forwarding + templates/ + smart-git-proxy.service.j2 # systemd unit + smart-git-proxy-env.j2 # Environment config + vector.yaml.j2 # Vector config for Axiom + production-usw.ini # US-West inventory (3 nodes) + production-euw.ini # EU-West inventory (placeholder) + production-euc.ini # EU-Central inventory (placeholder) + staging.ini # Staging inventory (1 node) + secrets.yml # ansible-vault encrypted secrets +``` + +## Prerequisites + +```bash +# Install Ansible Galaxy dependencies +ansible-galaxy collection install -r requirements.yml +``` + +## Usage + +### Hydrate Raw Machines (once per node, first time) + +Bootstraps a freshly provisioned machine from scratch: sets hostname, installs +packages, configures Tailscale + firewall, formats NVMe, sets up Alloy metrics, +systemd service, and Vector log forwarding. + +```bash +ansible-playbook -i staging.ini hydrate.yaml --vault-password-file ~/vault-password.txt +ansible-playbook -i production-usw.ini hydrate.yaml --vault-password-file ~/vault-password.txt +``` + +### Incremental Setup (re-run safe) + +Updates service config, systemd unit, Alloy, and Vector without touching +Tailscale or re-formatting NVMe. Safe to re-run. 
+ +```bash +ansible-playbook -i production-usw.ini setup.yaml --vault-password-file ~/vault-password.txt +``` + +### Deploy from Branch + +```bash +ansible-playbook -i production-usw.ini roll.yaml -e branch=main --vault-password-file ~/vault-password.txt +``` + +### Deploy from Release + +```bash +ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --vault-password-file ~/vault-password.txt +``` + +## Configuration + +Key environment variables (set in `templates/smart-git-proxy-env.j2`): + +| Variable | Default | Description | +|----------|---------|-------------| +| `LISTEN_ADDR` | `:8080` | HTTP listen address | +| `MIRROR_DIR` | `/mnt/nvme/mirrors` | Path for bare git mirrors | +| `MIRROR_MAX_SIZE` | `80%` | LRU eviction threshold | +| `SYNC_STALE_AFTER` | `2s` | Upstream sync staleness window | +| `AUTH_MODE` | `pass-through` | Forward client's GitHub token upstream | + +## Secrets + +`secrets.yml` must contain (ansible-vault encrypted): +- `github_token` — GitHub token for cloning the repo during branch builds +- `axiom_token` — Axiom API token for log forwarding +- `tailscale_auth_key` — Tailscale auth key (tag: `git-proxy`) + +## Inventory + +Each host entry is named after the node's Tailscale hostname; `ansible_host` +holds its public IP and is only needed for initial hydration (SSH over the +public internet), before the node has joined the tailnet. + +After hydration, remove the `ansible_host` entries so that Ansible (including the +build-and-deploy workflow) reaches each node by its Tailscale hostname instead. 
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000..6dca555 --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,6 @@ +[defaults] +host_key_checking = False +timeout = 30 + +[galaxy] +collections_path = ~/.ansible/collections diff --git a/ansible/hydrate.yaml b/ansible/hydrate.yaml new file mode 100644 index 0000000..fd3c219 --- /dev/null +++ b/ansible/hydrate.yaml @@ -0,0 +1,38 @@ +--- +# Full hydration for raw machines: hostname, packages, Tailscale, NVMe, service config, logging. +# Run once per node after provisioning. +# +# Usage: +# ansible-playbook -i staging.ini hydrate.yaml --vault-password-file ~/vault-password.txt +# ansible-playbook -i production-usw.ini hydrate.yaml --vault-password-file ~/vault-password.txt + +- name: Hydrate Smart Git Proxy Nodes + hosts: all + become: yes + serial: 4 + vars_files: + - secrets.yml + tasks: + - name: Set hostname + include_tasks: tasks/setup_hostname.yaml + + - name: Install base packages + include_tasks: tasks/install_dependencies.yaml + + - name: Set up Tailscale + include_tasks: tasks/setup_tailscale.yaml + + - name: Configure Tailscale firewall rules + include_tasks: tasks/configure_tailscale_firewall.yaml + + - name: Set up NVMe storage + include_tasks: tasks/setup_nvme.yaml + + - name: Set up Grafana Alloy for metrics + include_tasks: tasks/setup_grafana_alloy.yaml + + - name: Set up service config and systemd + include_tasks: tasks/setup_service.yaml + + - name: Set up Axiom log forwarding + include_tasks: tasks/setup_axiom.yaml diff --git a/ansible/production-euc.ini b/ansible/production-euc.ini new file mode 100644 index 0000000..5d7ee50 --- /dev/null +++ b/ansible/production-euc.ini @@ -0,0 +1,9 @@ +# EU-Central proxy nodes (2 nodes) +# Update with actual IPs after provisioning. 
+[all:vars] +region=eu-central +env=production + +[proxy] +# proxy-euc-1 ansible_host= +# proxy-euc-2 ansible_host= diff --git a/ansible/production-euw.ini b/ansible/production-euw.ini new file mode 100644 index 0000000..dde0031 --- /dev/null +++ b/ansible/production-euw.ini @@ -0,0 +1,9 @@ +# EU-West proxy nodes (2 nodes) +# Update with actual IPs after provisioning. +[all:vars] +region=eu-west +env=production + +[proxy] +# proxy-euw-1 ansible_host= +# proxy-euw-2 ansible_host= diff --git a/ansible/production-usw.ini b/ansible/production-usw.ini new file mode 100644 index 0000000..f17ada7 --- /dev/null +++ b/ansible/production-usw.ini @@ -0,0 +1,15 @@ +# US-West proxy nodes (3 nodes, s4.s2.large, PhoenixNAP, 36mo reservation) +# Provisioned 2026-06-17 +# +# Host keys are Tailscale hostnames (resolved via MagicDNS from CI). +# ansible_host is the public IP (used for initial hydration before Tailscale). +# After hydration, remove ansible_host lines to connect via Tailscale. +[all:vars] +ansible_user=ubuntu +region=us-west +env=production + +[proxy] +git-proxy-usw-1 ansible_host=192.240.240.207 server_id=6a329c5bcd195a90018570da +git-proxy-usw-2 ansible_host=192.240.240.208 server_id=6a329c5dcd195a90018570db +git-proxy-usw-3 ansible_host=192.240.240.209 server_id=6a329c5ecd195a90018570dc diff --git a/ansible/requirements.yml b/ansible/requirements.yml new file mode 100644 index 0000000..a0c53c6 --- /dev/null +++ b/ansible/requirements.yml @@ -0,0 +1,3 @@ +--- +collections: + - name: artis3n.tailscale diff --git a/ansible/roll.yaml b/ansible/roll.yaml new file mode 100644 index 0000000..8fe2186 --- /dev/null +++ b/ansible/roll.yaml @@ -0,0 +1,117 @@ +--- +# Rolling deploy: build from branch (or download release), install binary, restart. 
+# Usage (exactly one of branch / release_tag is required; branch wins if both are set): +# ansible-playbook -i production-usw.ini roll.yaml -e release_tag=v1.0.0 --vault-password-file ~/vault-password.txt +# ansible-playbook -i production-usw.ini roll.yaml -e branch=feat/my-change --vault-password-file ~/vault-password.txt + +- name: Roll Smart Git Proxy + hosts: all + become: yes + serial: 1 + vars_files: + - secrets.yml + vars: + branch: "" + release_tag: "" + tasks: + - name: Determine environment from inventory file name + set_fact: + ENV: "{{ 'production' if 'production' in inventory_file else 'staging' if 'staging' in inventory_file else '' }}" + + - name: Ensure ENV is determined + fail: + msg: "Failed to determine environment from inventory file name." + when: ENV == '' + + # --- Branch build path --- + - name: Install Go for branch build + # Fail fast: without "set -e" a failed download still deleted /usr/local/go and the task reported success. + shell: | + set -euo pipefail + if /usr/local/go/bin/go version 2>/dev/null | grep -q 'go1.25'; then + echo "Go already installed" + exit 0 + fi + wget -q https://go.dev/dl/go1.25.0.linux-amd64.tar.gz -O /tmp/go.tar.gz + rm -rf /usr/local/go + tar -C /usr/local -xzf /tmp/go.tar.gz + rm /tmp/go.tar.gz + args: + executable: /bin/bash + when: branch != "" + + - name: Clone repo at branch + git: + repo: "https://{{ github_token }}@github.com/useblacksmith/smart-git-proxy.git" + dest: /tmp/smart-git-proxy-build + version: "{{ branch }}" + force: yes + when: branch != "" + + - name: Build from branch + shell: | + set -euo pipefail + export PATH=/usr/local/go/bin:$PATH + export GO=go + cd /tmp/smart-git-proxy-build + make build + args: + executable: /bin/bash + when: branch != "" + + - name: Install branch binary + copy: + src: /tmp/smart-git-proxy-build/bin/smart-git-proxy + dest: /usr/local/bin/smart-git-proxy + mode: "0755" + remote_src: yes + when: branch != "" + + - name: Clean up build directory + file: + path: /tmp/smart-git-proxy-build + state: absent + when: branch != "" + + # --- Release download path --- + # Goreleaser publishes archives as: smart-git-proxy_<version>_linux_amd64.tar.gz + - name: Download release archive + get_url: + url: "https://github.com/useblacksmith/smart-git-proxy/releases/download/{{ release_tag }}/smart-git-proxy_{{ release_tag | 
regex_replace('^v', '') }}_linux_amd64.tar.gz" + dest: /tmp/smart-git-proxy-release.tar.gz + force: yes + when: branch == "" and release_tag != "" + + - name: Extract release binary + shell: | + set -euo pipefail + mkdir -p /tmp/smart-git-proxy-release + tar -xzf /tmp/smart-git-proxy-release.tar.gz -C /tmp/smart-git-proxy-release + install -m 0755 /tmp/smart-git-proxy-release/smart-git-proxy /usr/local/bin/smart-git-proxy + rm -rf /tmp/smart-git-proxy-release /tmp/smart-git-proxy-release.tar.gz + args: + executable: /bin/bash + when: branch == "" and release_tag != "" + + # --- Validate deploy target --- + - name: Fail if neither branch nor release_tag provided + fail: + msg: "Must specify either -e branch= or -e release_tag=" + when: branch == "" and release_tag == "" + + # --- Restart --- + - name: Restart smart-git-proxy + systemd: + name: smart-git-proxy + state: restarted + enabled: yes + + - name: Wait for health check + uri: + url: http://localhost:8080/healthz + status_code: 200 + timeout: 5 + register: health + retries: 10 + delay: 2 + until: health.status == 200 diff --git a/ansible/secrets.yml b/ansible/secrets.yml new file mode 100644 index 0000000..3dd4a09 --- /dev/null +++ b/ansible/secrets.yml @@ -0,0 +1,12 @@ +$ANSIBLE_VAULT;1.1;AES256 +35386437653735306338376264643236303766323334656130363862353332333838666262316262 +6437376137356362636362316666386436313062376338640a303862636133323861633664626335 +63643534613963636137663533396433343833376266313031386134663166633065343438616537 +3166663338663162340a386163353433663036653236613438366631303431323463633633626265 +64656438313266303662333539313432386163373864323136663137336665383938343932666233 +64346165643261616265343861633163363462366235343639343165366438663364306638396335 +33636238386338366331336662383137376262303061326234323434383835373564346266626532 +34356632303836396662323063323733373031373735373366386235666365346537373837306261 
+36323065333933623963633835306535313835353430623365373038646665333666633037343830 +36353135663563346134336361626133663238386532393863316336323932303661666636323161 +663866333462633535396465346166336132 diff --git a/ansible/setup.yaml b/ansible/setup.yaml new file mode 100644 index 0000000..ce3d4d5 --- /dev/null +++ b/ansible/setup.yaml @@ -0,0 +1,24 @@ +--- +# Incremental setup: deploy service config, systemd unit, NVMe validation, Vector logging. +# For raw machine bootstrapping (including Tailscale, NVMe format, packages), use hydrate.yaml. +# +# Usage: +# ansible-playbook -i production-usw.ini setup.yaml --vault-password-file ~/vault-password.txt + +- name: Setup Smart Git Proxy + hosts: all + become: yes + vars_files: + - secrets.yml + tasks: + - name: Set up NVMe storage + include_tasks: tasks/setup_nvme.yaml + + - name: Set up Grafana Alloy for metrics + include_tasks: tasks/setup_grafana_alloy.yaml + + - name: Set up service config and systemd + include_tasks: tasks/setup_service.yaml + + - name: Set up Axiom log forwarding + include_tasks: tasks/setup_axiom.yaml diff --git a/ansible/staging.ini b/ansible/staging.ini new file mode 100644 index 0000000..a0fd252 --- /dev/null +++ b/ansible/staging.ini @@ -0,0 +1,13 @@ +# Staging proxy node (1 node, s4.s2.large, PhoenixNAP, 36mo reservation) +# Provisioned 2026-06-17 +# +# Host keys are Tailscale hostnames (resolved via MagicDNS from CI). +# ansible_host is the public IP (used for initial hydration before Tailscale). +# After hydration, remove ansible_host lines to connect via Tailscale. 
+[all:vars] +ansible_user=ubuntu +region=us-west +env=staging + +[proxy] +git-proxy-staging-1 ansible_host=192.240.240.210 server_id=6a329cb6cd195a90018570de diff --git a/ansible/tasks/configure_tailscale_firewall.yaml b/ansible/tasks/configure_tailscale_firewall.yaml new file mode 100644 index 0000000..1b41c03 --- /dev/null +++ b/ansible/tasks/configure_tailscale_firewall.yaml @@ -0,0 +1,43 @@ +--- +- name: Ensure UFW is installed + apt: + name: ufw + state: present + +- name: Enable UFW with default allow + ufw: + state: enabled + policy: allow + direction: incoming + logging: "on" + +- name: Allow Tailscale traffic within Tailscale network + ufw: + rule: allow + direction: out + to_ip: 100.64.0.0/10 + port: "41641" + proto: udp + +- name: Block Tailscale UDP scanning to private ranges + ufw: + rule: deny + direction: out + to_ip: "{{ item }}" + port: "41641" + proto: udp + loop: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + +- name: Block all other Tailscale outbound scanning + ufw: + rule: deny + direction: out + port: "41641" + proto: udp + +- name: Reload UFW + ufw: + state: reloaded diff --git a/ansible/tasks/install_dependencies.yaml b/ansible/tasks/install_dependencies.yaml new file mode 100644 index 0000000..6f84aa5 --- /dev/null +++ b/ansible/tasks/install_dependencies.yaml @@ -0,0 +1,24 @@ +--- +- name: Wait for dpkg lock + shell: while lsof /var/lib/dpkg/lock-frontend 2>/dev/null; do sleep 10; done; + +- name: Update apt cache + apt: + update_cache: yes + retries: 5 + delay: 10 + register: apt_update + until: apt_update is success + +- name: Install required packages + apt: + name: + - curl + - wget + - git + - make + - jq + - xfsprogs + - nvme-cli + - ufw + state: present diff --git a/ansible/tasks/setup_axiom.yaml b/ansible/tasks/setup_axiom.yaml new file mode 100644 index 0000000..e27c856 --- /dev/null +++ b/ansible/tasks/setup_axiom.yaml @@ -0,0 +1,57 @@ +--- +- name: Add Vector repository + ansible.builtin.shell: | + bash -c "$(curl -L 
https://setup.vector.dev)" + args: + creates: /usr/share/keyrings/vector-archive-keyring.gpg + +- name: Install Vector + ansible.builtin.apt: + name: vector + state: present + update_cache: yes + +- name: Get hostname + shell: hostname + register: actual_hostname + changed_when: false + +- name: Create systemd override directory for Vector + file: + path: /etc/systemd/system/vector.service.d + state: directory + mode: "0755" + +- name: Configure Vector systemd override + copy: + content: | + [Service] + Environment="HOSTNAME={{ actual_hostname.stdout }}" + dest: /etc/systemd/system/vector.service.d/override.conf + mode: "0644" + register: vector_systemd_override + +- name: Reload systemd if Vector override changed + systemd: + daemon_reload: yes + when: vector_systemd_override.changed + +- name: Deploy Vector config + template: + src: templates/vector.yaml.j2 + dest: /etc/vector/vector.yaml + owner: vector + group: vector + mode: "0600" + register: vector_config + +- name: Reset Vector failed state + ansible.builtin.command: systemctl reset-failed vector + changed_when: false + failed_when: false + +- name: Enable and restart Vector + systemd: + name: vector + state: restarted + enabled: yes diff --git a/ansible/tasks/setup_grafana_alloy.yaml b/ansible/tasks/setup_grafana_alloy.yaml new file mode 100644 index 0000000..bb44c89 --- /dev/null +++ b/ansible/tasks/setup_grafana_alloy.yaml @@ -0,0 +1,78 @@ +--- +- name: Set Grafana endpoints + set_fact: + self_hosted_prometheus_url: "http://grafana.internal.blacksmith.sh:9090/api/v1/write" + +- name: Install Grafana Alloy + shell: | + ARCH=$(dpkg --print-architecture) + wget -q -O /tmp/alloy.deb "https://github.com/grafana/alloy/releases/download/v1.5.1/alloy-1.5.1-1.${ARCH}.deb" + DEBIAN_FRONTEND=noninteractive dpkg -i --force-confnew /tmp/alloy.deb || apt-get install -f -y + rm -f /tmp/alloy.deb + systemctl daemon-reload + args: + creates: /usr/bin/alloy + +- name: Create Alloy configuration directory + file: + path: 
/etc/alloy + state: directory + mode: "0755" + +- name: Configure Grafana Alloy + copy: + content: | + // Prometheus scrape of local smart-git-proxy /metrics endpoint + prometheus.scrape "smart_git_proxy" { + targets = [{ + __address__ = "127.0.0.1:8080", + }] + metrics_path = "/metrics" + scrape_interval = "15s" + forward_to = [prometheus.relabel.add_labels.receiver] + } + + // Add environment/region labels + prometheus.relabel "add_labels" { + forward_to = [prometheus.remote_write.self_hosted.receiver] + + rule { + target_label = "environment" + replacement = "{{ env }}" + } + rule { + target_label = "region" + replacement = "{{ region }}" + } + rule { + target_label = "job" + replacement = "smart-git-proxy" + } + rule { + target_label = "instance" + replacement = "{{ inventory_hostname }}" + } + } + + // Remote write to self-hosted Prometheus + prometheus.remote_write "self_hosted" { + endpoint { + url = "{{ self_hosted_prometheus_url }}" + } + } + dest: /etc/alloy/config.alloy + mode: "0644" + register: alloy_config + +- name: Enable and restart Alloy + systemd: + name: alloy + state: restarted + enabled: yes + when: alloy_config.changed + +- name: Ensure Alloy is running + systemd: + name: alloy + state: started + enabled: yes diff --git a/ansible/tasks/setup_hostname.yaml b/ansible/tasks/setup_hostname.yaml new file mode 100644 index 0000000..4679323 --- /dev/null +++ b/ansible/tasks/setup_hostname.yaml @@ -0,0 +1,19 @@ +--- +- name: Set hostname + hostname: + name: "{{ inventory_hostname }}" + +- name: Add hostname to /etc/hosts + lineinfile: + path: /etc/hosts + line: "{{ ansible_default_ipv4.address }} {{ inventory_hostname }}" + regexp: "^{{ ansible_default_ipv4.address }}" + state: present + +- name: Prevent cloud-init from overwriting hostname + copy: + content: | + #cloud-config + preserve_hostname: true + dest: /etc/cloud/cloud.cfg.d/99-preserve-hostname.cfg + mode: "0644" diff --git a/ansible/tasks/setup_nvme.yaml b/ansible/tasks/setup_nvme.yaml new 
file mode 100644
index 0000000..c9e9e87
--- /dev/null
+++ b/ansible/tasks/setup_nvme.yaml
@@ -0,0 +1,94 @@
+---
+# s4.s2.large machines have 1x 1TB OS + 2x 8TB NVMe data drives.
+# We format and mount one data drive for git mirrors.
+
+- name: Get block device info
+  # -b makes lsblk report SIZE in bytes, so the size filter below does not
+  # depend on the human-readable unit suffix (a "931.5G" disk is not >2T).
+  command: lsblk -b -J -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE
+  register: lsblk_output
+  changed_when: false  # read-only query
+
+- name: Parse block devices
+  set_fact:
+    block_devices: "{{ (lsblk_output.stdout | from_json).blockdevices }}"
+
+# Builds the list of candidate data disks. A disk qualifies when it is larger
+# than 2 TiB and none of its partitions (two levels deep) is mounted at
+# /, /boot or /boot/efi. A disk already mounted at /mnt/nvme is placed first so
+# that re-running the play keeps using it instead of switching to the other,
+# still-unformatted data drive.
+- name: Find data NVMe drives (>2TB, no OS partitions)
+  set_fact:
+    data_drives: >-
+      {%- set drives = [] -%}
+      {%- for device in block_devices -%}
+      {%- if device.type == 'disk' -%}
+      {%- set has_os = [] -%}
+      {%- if device.children is defined -%}
+      {%- for child in device.children -%}
+      {%- if child.mountpoint in ['/', '/boot', '/boot/efi'] -%}
+      {%- if has_os.append(true) -%}{%- endif -%}
+      {%- endif -%}
+      {%- if child.children is defined -%}
+      {%- for gc in child.children -%}
+      {%- if gc.mountpoint in ['/', '/boot', '/boot/efi'] -%}
+      {%- if has_os.append(true) -%}{%- endif -%}
+      {%- endif -%}
+      {%- endfor -%}
+      {%- endif -%}
+      {%- endfor -%}
+      {%- endif -%}
+      {%- set size_bytes = device.size | int -%}
+      {%- if has_os | length == 0 and size_bytes > (2 * 1024 ** 4) -%}
+      {%- if device.mountpoint == '/mnt/nvme' -%}
+      {%- if drives.insert(0, device.name) -%}{%- endif -%}
+      {%- elif device.mountpoint is none -%}
+      {%- if drives.append(device.name) -%}{%- endif -%}
+      {%- endif -%}
+      {%- endif -%}
+      {%- endif -%}
+      {%- endfor -%}
+      {{ drives }}
+
+- name: Display detected data drives
+  debug:
+    msg: "Data NVMe drives: {{ data_drives }}"
+
+- name: Fail if no data drives found
+  fail:
+    msg: "No NVMe data drives >2TB found. Expected s4.s2.large with 2x 8TB NVMe."
+  when: data_drives | length == 0
+
+- name: Check if /mnt/nvme is already mounted
+  command: mountpoint -q /mnt/nvme
+  register: nvme_mounted
+  failed_when: false  # the rc is only inspected by the format task below
+  changed_when: false
+
+- name: Format first data drive as XFS
+  filesystem:
+    fstype: xfs
+    dev: "/dev/{{ data_drives[0] }}"
+    force: false
+  when: nvme_mounted.rc != 0
+
+- name: Create /mnt/nvme mount point
+  file:
+    path: /mnt/nvme
+    state: directory
+    mode: "0755"
+
+- name: Mount NVMe drive
+  mount:
+    path: /mnt/nvme
+    src: "/dev/{{ data_drives[0] }}"
+    fstype: xfs
+    opts: defaults,noatime
+    state: mounted
+
+- name: Create mirrors directory
+  file:
+    path: /mnt/nvme/mirrors
+    state: directory
+    mode: "0755"
diff --git a/ansible/tasks/setup_service.yaml b/ansible/tasks/setup_service.yaml
new file mode 100644
index 0000000..5e630d5
--- /dev/null
+++ b/ansible/tasks/setup_service.yaml
@@ -0,0 +1,35 @@
+---
+# Installs the smart-git-proxy environment file and systemd unit, then enables
+# the service at boot. No task in this file starts or restarts the service.
+- name: Determine environment from inventory
+  set_fact:
+    ENV: "{{ env }}"
+
+- name: Ensure config directory exists
+  file:
+    path: /etc/smart-git-proxy
+    state: directory
+    mode: "0755"
+
+- name: Deploy environment config
+  template:
+    src: templates/smart-git-proxy-env.j2
+    dest: /etc/smart-git-proxy/env
+    mode: "0600"
+
+- name: Deploy systemd unit
+  template:
+    src: templates/smart-git-proxy.service.j2
+    dest: /etc/systemd/system/smart-git-proxy.service
+    mode: "0644"
+  register: systemd_unit
+
+- name: Reload systemd daemon
+  systemd:
+    daemon_reload: true
+  when: systemd_unit.changed  # only needed when the unit file was rewritten
+
+- name: Enable smart-git-proxy service
+  systemd:
+    name: smart-git-proxy
+    enabled: true
diff --git a/ansible/tasks/setup_tailscale.yaml b/ansible/tasks/setup_tailscale.yaml
new file mode 100644
index 0000000..f885661
--- /dev/null
+++ b/ansible/tasks/setup_tailscale.yaml
@@ -0,0 +1,23 @@
+---
+- name: Include Tailscale role
+  ansible.builtin.include_role:
+    name: artis3n.tailscale.machine
+  vars:
+    tailscale_authkey: "{{ tailscale_auth_key }}"
+    tailscale_args: "--ssh --hostname {{ inventory_hostname }}"
+
tailscale_tags: "{{ ['git-proxy'] }}"
+
+- name: Ensure Tailscale is running
+  ansible.builtin.systemd:
+    name: tailscaled
+    state: started
+    enabled: true
+
+- name: Get Tailscale IP
+  ansible.builtin.command: tailscale ip
+  register: tailscale_ip
+  changed_when: false  # read-only query
+
+- name: Display Tailscale IP
+  ansible.builtin.debug:
+    msg: "Tailscale IP for {{ inventory_hostname }}: {{ tailscale_ip.stdout }}"
diff --git a/ansible/templates/smart-git-proxy-env.j2 b/ansible/templates/smart-git-proxy-env.j2
new file mode 100644
index 0000000..457b4ee
--- /dev/null
+++ b/ansible/templates/smart-git-proxy-env.j2
@@ -0,0 +1,10 @@
+# Smart Git Proxy configuration
+# See https://github.com/useblacksmith/smart-git-proxy#configuration
+
+LISTEN_ADDR=:8080
+MIRROR_DIR={{ mirror_dir | default('/mnt/nvme/mirrors') }}
+MIRROR_MAX_SIZE={{ mirror_max_size | default('80%') }}
+SYNC_STALE_AFTER={{ sync_stale_after | default('2s') }}
+ALLOWED_UPSTREAMS=github.com
+AUTH_MODE=pass-through
+LOG_LEVEL={{ log_level | default('info') }}
diff --git a/ansible/templates/smart-git-proxy.service.j2 b/ansible/templates/smart-git-proxy.service.j2
new file mode 100644
index 0000000..dae6118
--- /dev/null
+++ b/ansible/templates/smart-git-proxy.service.j2
@@ -0,0 +1,25 @@
+# systemd unit for smart-git-proxy; settings come from /etc/smart-git-proxy/env.
+[Unit]
+Description=Smart Git Proxy
+After=network-online.target
+Wants=network-online.target
+# Stop restarting after 10 start attempts within 300 seconds.
+StartLimitIntervalSec=300
+StartLimitBurst=10
+
+[Service]
+Type=exec
+Environment="HOME=/root"
+EnvironmentFile=/etc/smart-git-proxy/env
+ExecStart=/usr/local/bin/smart-git-proxy
+Restart=on-failure
+RestartSec=10
+TimeoutStopSec=30
+KillMode=control-group
+KillSignal=SIGTERM
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=smart-git-proxy
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/templates/vector.yaml.j2 b/ansible/templates/vector.yaml.j2
new file mode 100644
index 0000000..e2c698f
--- /dev/null
+++ b/ansible/templates/vector.yaml.j2
@@ -0,0 +1,37 @@
+# Vector config for shipping smart-git-proxy logs to
Axiom.
+sources:
+  smart_git_proxy_logs:
+    type: journald
+    include_units:
+      - smart-git-proxy
+
+transforms:
+  add_metadata:
+    type: remap
+    inputs:
+      - smart_git_proxy_logs
+    source: |
+      if exists(.message) {
+        parsed, err = parse_json(.message)
+        if err == null {
+          . = merge!(., parsed)
+        }
+      }
+
+      . = merge!(., {
+        "host": "${HOSTNAME}",
+        "environment": "{{ env }}",
+        "region": "{{ region }}",
+        "service_name": "smart-git-proxy"
+      })
+
+sinks:
+  axiom:
+    type: axiom
+    inputs:
+      - add_metadata
+    token: "{{ axiom_token }}"
+    dataset: smart-git-proxy
+    batch:
+      max_bytes: 1049000
+      timeout_secs: 1
diff --git a/grafana/alert-rules/smart-git-proxy-alerts.yaml b/grafana/alert-rules/smart-git-proxy-alerts.yaml
new file mode 100644
index 0000000..42d5296
--- /dev/null
+++ b/grafana/alert-rules/smart-git-proxy-alerts.yaml
@@ -0,0 +1,57 @@
+# Grafana alert rules for Smart Git Proxy.
+# Import into Grafana via Alerting > Alert rules > Import.
+#
+# Metric names match internal/metrics/metrics.go:
+#   smart_git_proxy_requests_total  (repo, kind, source)
+#   smart_git_proxy_responses_total (repo, kind, status)
+#   smart_git_proxy_errors_total    (repo, kind)
+#   smart_git_proxy_request_seconds (repo, kind) [histogram]
+#   smart_git_proxy_sync_total      (repo, result)
+
+groups:
+  - name: smart-git-proxy
+    interval: 1m
+    rules:
+      - uid: sgp-node-down
+        alert: ProxyNodeDown
+        # Metrics are pushed by the Alloy agent running on the node itself, so a
+        # dead host produces no `up` samples at all rather than up == 0. The
+        # second operand fires for any instance that reported within the last
+        # hour but has no current sample.
+        expr: up{job="smart-git-proxy"} == 0 or (max_over_time(up{job="smart-git-proxy"}[1h]) unless on (instance) up{job="smart-git-proxy"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Smart git proxy node {{ $labels.instance }} is down"
+          description: "Health endpoint unreachable for 2 minutes."
+
+      - uid: sgp-error-rate
+        alert: ErrorRateHigh
+        expr: sum(rate(smart_git_proxy_errors_total[5m])) by (instance) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Elevated error rate on {{ $labels.instance }}"
+          description: "Errors at {{ $value | humanize }}/sec. May indicate upstream GitHub issues or local git failures."
+
+      - uid: sgp-latency-high
+        alert: RequestLatencyHigh
+        expr: histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket[5m])) by (le, instance)) > 30
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "p95 request latency >30s on {{ $labels.instance }}"
+          description: "Slow git request serving — may indicate NVMe I/O pressure or excessive concurrent requests."
+
+      - uid: sgp-sync-failures
+        alert: SyncFailureRateHigh
+        expr: sum(rate(smart_git_proxy_sync_total{result="error"}[5m])) by (instance) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Elevated sync failures on {{ $labels.instance }}"
+          description: "Upstream GitHub sync failures at {{ $value | humanize }}/sec. May indicate rate limiting or network issues."
diff --git a/grafana/dashboards/smart-git-proxy.json b/grafana/dashboards/smart-git-proxy.json
new file mode 100644
index 0000000..fdfae93
--- /dev/null
+++ b/grafana/dashboards/smart-git-proxy.json
@@ -0,0 +1,218 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "id": 100,
+      "title": "Overview",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Requests / sec",
+      "description": "Rate of git requests served by the proxy (all kinds: info-refs, upload-pack).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Requests by Kind",
+      "description": "Request rate broken down by kind (info-refs, upload-pack).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (kind)",
+          "legendFormat": "{{ kind }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Request Latency (p50 / p95 / p99)",
+      "description": "Time to serve a git request from local NVMe mirror.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))",
+          "legendFormat": "{{ instance }} p50"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))",
+          "legendFormat": "{{ instance }} p95"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(smart_git_proxy_request_seconds_bucket{instance=~\"$instance\"}[$__rate_interval])) by (le, instance))",
+          "legendFormat": "{{ instance }} p99"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Mirror Syncs / sec",
+      "description": "Rate of mirror sync operations (upstream fetches from GitHub), by result.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_sync_total{instance=~\"$instance\"}[$__rate_interval])) by (result)",
+          "legendFormat": "{{ result }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 },
+      "id": 101,
+      "title": "Responses & Errors",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Response Status",
+      "description": "Responses by HTTP status code.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_responses_total{instance=~\"$instance\"}[$__rate_interval])) by (status)",
+          "legendFormat": "{{ status }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Errors / sec",
+      "description": "Error rate per instance.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "sum(rate(smart_git_proxy_errors_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
+          "legendFormat": "{{ instance }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
+      "id": 102,
+      "title": "Per-Repo Traffic",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "title": "Top Repos by Request Rate",
+      "description": "Request rate broken down by repository.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 27 },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "topk(10, sum(rate(smart_git_proxy_requests_total{instance=~\"$instance\"}[$__rate_interval])) by (repo))",
+          "legendFormat": "{{ repo }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }
+        }
+      }
+    }
+  ],
+  "templating": {
+    "list": [
+      {
+        "name": "datasource",
+        "type": "datasource",
+        "query": "prometheus",
+        "current": { "text": "Prometheus", "value": "PBFA97CFB590B2093" }
+      },
+      {
+        "name": "instance",
+        "type": "query",
+        "datasource": { "type": "prometheus", "uid": "${datasource}" },
+        "query": "label_values(smart_git_proxy_requests_total, instance)",
+        "includeAll": true,
+        "allValue": ".*",
+        "multi": true,
+        "current": { "text": "All", "value": "$__all" }
+      }
+    ]
+  },
+  "time": { "from": "now-6h", "to": "now" },
+  "timepicker": {},
+  "timezone": "utc",
+  "title": "Smart Git Proxy",
+  "uid": "smart-git-proxy",
+  "version": 1,
+  "schemaVersion": 36
+}