Commits (19)
990c46e
Skeleton for nightly run of Apps Codegen Evals
keugenek Dec 5, 2025
5a42d63
Fix Python version compatibility for serverless compute
keugenek Dec 5, 2025
8c1e665
Proper url for evals repo
keugenek Dec 5, 2025
bdf3d35
Simplify eval runner - clone repo and run evaluation
keugenek Dec 8, 2025
4d58e99
Merge branch 'main' into exprimental/mcp-evals-nightly
keugenek Dec 8, 2025
6602986
Add app generation job using CLI as MCP server
keugenek Dec 8, 2025
ea20bd0
Fix UC Volume paths for CLI binary and generated apps
keugenek Dec 8, 2025
3aa5dd8
Fix app generation: use LiteLLM backend + fix UC Volume symlinks
keugenek Dec 9, 2025
61be7e7
Fix eval job CLI parameter parsing
keugenek Dec 9, 2025
d4cfd37
Merge remote-tracking branch 'origin/main' into exprimental/mcp-evals…
keugenek Dec 10, 2025
52536b5
Merge branch 'main' into exprimental/mcp-evals-nightly
keugenek Dec 10, 2025
cf78b42
Required to bypass proc mount restrictions and AppArmor.
keugenek Dec 11, 2025
3d5b131
Refactor get_prompts function to use external import for prompt retri…
keugenek Dec 11, 2025
b08f318
Update README.md for Apps-MCP Evals: Enhance documentation to clarify…
keugenek Dec 11, 2025
7c04633
Merge branch 'main' into exprimental/mcp-evals-nightly
keugenek Dec 11, 2025
b2cf28c
Refactor eval setup and runner: Remove Docker installation and manage…
keugenek Dec 11, 2025
6df3873
Add local evaluation functionality: Implement run_local_evaluation fu…
keugenek Dec 11, 2025
8375412
Update getting prompts
keugenek Jan 6, 2026
5f6af3c
Undo gitignore to avoid additional reviewrs
keugenek Jan 6, 2026
129 changes: 129 additions & 0 deletions experimental/apps-mcp/evals/README.md
@@ -0,0 +1,129 @@
# Apps-MCP Evals

Databricks Asset Bundle for generating and evaluating apps using the Apps-MCP system with the klaudbiusz framework.

## Overview

This bundle provides two jobs:
1. **Generation Job** - Generates apps using klaudbiusz with the Databricks CLI as an MCP server
2. **Evaluation Job** - Evaluates generated apps and logs results to MLflow

## Prerequisites

1. **Databricks Secrets** - Create secret scope and add tokens:
```bash
databricks secrets create-scope apps-mcp-evals
databricks secrets put-secret apps-mcp-evals anthropic-api-key
databricks secrets put-secret apps-mcp-evals databricks-token
```

2. **UC Volumes** - Create volumes for artifacts:
```bash
databricks volumes create main.default.apps_mcp_artifacts
databricks volumes create main.default.apps_mcp_generated
```

3. **CLI Binary** - Build and upload Linux CLI binary:
```bash
GOOS=linux GOARCH=amd64 go build -o databricks-linux
databricks fs cp databricks-linux /Volumes/main/default/apps_mcp_artifacts/
```

## Quick Start

```bash
cd experimental/apps-mcp/evals

# Validate bundle
databricks bundle validate -t dev

# Deploy
databricks bundle deploy -t dev

# Run generation (creates apps in UC Volume)
databricks bundle run -t dev apps_generation_job

# Run evaluation (evaluates apps, logs to MLflow)
databricks bundle run -t dev apps_eval_job
```

## Jobs

### Generation Job (`apps_generation_job`)

Generates apps using klaudbiusz's `local_run` with the LiteLLM backend.

**Parameters:**
- `prompts` - Prompt set: `databricks`, `databricks_v2`, or `test` (default: `test`)
- `cli_binary_volume` - Path to CLI binary volume
- `apps_volume` - Output volume for generated apps

**Cluster:** Jobs cluster on Databricks Runtime 16.2.x (Python 3.12)
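
For local debugging, the same entry point can be invoked directly with the flags the job task passes (see `resources/apps_generation_job.job.yml`); the paths and values below are simply the bundle defaults:

```bash
# Mirrors the job task invocation; volume paths and prompt set are illustrative defaults.
python src/generate_apps.py \
  --mcp-binary /Volumes/main/default/apps_mcp_artifacts/databricks-linux \
  --output-volume /Volumes/main/default/apps_mcp_generated \
  --prompts test \
  --max-concurrency 4
```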

### Evaluation Job (`apps_eval_job`)

Evaluates generated apps using klaudbiusz's Docker-based evaluation.

**Parameters:**
- `apps_volume` - Volume containing apps to evaluate
- `mlflow_experiment` - MLflow experiment for logging results
- `parallelism` - Number of parallel evaluations

**Cluster:** Jobs cluster on Databricks Runtime 16.2.x, with Docker installed via an init script

**Schedule:** Nightly at 2am UTC
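
Likewise, the evaluation entry point can be run by hand with the flags the job task uses (see `resources/apps_eval_job.job.yml`); the values shown are the bundle defaults:

```bash
# Mirrors the job task invocation; values match the bundle variable defaults.
python src/run_evals.py \
  --mlflow-experiment /Shared/apps-mcp-evaluations \
  --parallelism 4 \
  --evals-git-url https://github.com/neondatabase/appdotbuild-agent.git \
  --apps-volume /Volumes/main/default/apps_mcp_generated
```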

## Configuration

### Variables

| Variable | Description | Default |
|----------|-------------|---------|
| `prompts` | Prompt set for generation | `test` |
| `cli_binary_volume` | UC Volume for CLI binary | `/Volumes/main/default/apps_mcp_artifacts` |
| `apps_volume` | UC Volume for generated apps | `/Volumes/main/default/apps_mcp_generated` |
| `mlflow_experiment` | MLflow experiment path | `/Shared/apps-mcp-evaluations` |
| `eval_parallelism` | Parallel eval workers | `4` |
| `evals_git_url` | klaudbiusz repo URL | `https://github.com/neondatabase/appdotbuild-agent.git` |
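
Each variable can be overridden per deployment with the bundle `--var` flag, for example:

```bash
# Override defaults for a one-off deploy (example values)
databricks bundle deploy -t dev \
  --var="prompts=databricks_v2" \
  --var="eval_parallelism=8"
```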

### Targets

- **dev** - Development mode, staging MLflow experiment
- **prod** - Production mode, service principal identity

## Monitoring

- **MLflow** - View metrics at the configured experiment path (see the CLI sketch after this list)
- **Health Alerts** - Eval job alerts if runtime exceeds 2 hours
- **Logs** - Check job run output for detailed evaluation results
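
One way to spot-check recent eval runs from a local machine is the MLflow CLI pointed at the workspace. This is a rough sketch: it assumes MLflow is installed, a Databricks CLI profile (or `DATABRICKS_HOST`/`DATABRICKS_TOKEN`) is configured, and that you look up the experiment ID for the configured path first; `<experiment-id>` is a placeholder.

```bash
export MLFLOW_TRACKING_URI=databricks
mlflow experiments search | grep apps-mcp-evaluations   # find the experiment ID
mlflow runs list --experiment-id <experiment-id>        # list recent eval runs
```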

## Architecture

```
evals/
├── databricks.yml                     # Bundle configuration
├── resources/
│   ├── apps_generation_job.job.yml    # Generation job
│   └── apps_eval_job.job.yml          # Evaluation job
├── init/
│   ├── setup_generation.sh            # Generation cluster init
│   └── setup_eval.sh                  # Eval cluster init (Docker)
├── src/
│   ├── generate_apps.py               # App generation orchestrator
│   └── run_evals.py                   # Evaluation orchestrator
└── pyproject.toml                     # Python package config
```

## Prompt Sets

Available prompt sets (configured via `prompts` variable):

- `test` - Simple test prompts (1 app) for quick validation
- `databricks` - 5 Databricks-focused dashboard prompts
- `databricks_v2` - 20 realistic human-style prompts

## Known Limitations

- Docker containers require the `--privileged` flag on Databricks clusters
- Generation uses the LiteLLM backend (the Claude Agent SDK has a root-user restriction)
- UC Volumes don't support symlinks, so a `latest.txt` pointer file is used instead
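
As an illustration of the last point, a consumer of the pointer file might resolve the latest generation output like this (the exact contents of `latest.txt` are an implementation detail of `generate_apps.py`; here it is assumed to hold the name of the latest run directory):

```bash
# Resolve the most recent generation output without relying on symlinks.
LATEST=$(cat /Volumes/main/default/apps_mcp_generated/latest.txt)
ls "/Volumes/main/default/apps_mcp_generated/${LATEST}"
```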
62 changes: 62 additions & 0 deletions experimental/apps-mcp/evals/databricks.yml
@@ -0,0 +1,62 @@
# Databricks Asset Bundle for Apps-MCP Continuous Evals
# See https://docs.databricks.com/dev-tools/bundles/index.html
bundle:
  name: apps-mcp-evals
  uuid: 80e50a10-c2da-4b59-99d6-e101b1bcf485

include:
  - resources/*.yml

artifacts:
  apps_mcp_evals:
    type: whl
    build: uv build --wheel
    path: .

variables:
  catalog:
    description: Unity Catalog for eval results
    default: main
  schema:
    description: Schema for eval tables
  mlflow_experiment:
    description: MLflow experiment path for tracking
    default: /Shared/apps-mcp-evaluations
  evals_git_url:
    description: Git URL for appdotbuild-agent eval framework
    default: https://github.com/neondatabase/appdotbuild-agent.git
  eval_parallelism:
    description: Number of parallel eval workers
    default: "4"
  cli_binary_volume:
    description: UC Volume path for CLI binary
    default: /Volumes/main/default/apps_mcp_artifacts
  apps_volume:
    description: UC Volume path for generated apps
    default: /Volumes/main/default/apps_mcp_generated
  generation_parallelism:
    description: Number of parallel app generations
    default: "4"
  prompts:
    description: Prompt set for generation (databricks, databricks_v2, test)
    default: test

targets:
  dev:
    mode: development
    default: true
    workspace:
      host: https://6177827686947384.4.gcp.databricks.com
    variables:
      schema: ${workspace.current_user.short_name}
      mlflow_experiment: /Shared/apps-mcp-evaluations-staging

  prod:
    mode: production
    workspace:
      host: https://6177827686947384.4.gcp.databricks.com
      root_path: /Workspace/Users/${workspace.current_user.user_name}/.bundle/${bundle.name}/${bundle.target}
    variables:
      schema: evals
    run_as:
      service_principal_name: apps-mcp-eval-sp
55 changes: 55 additions & 0 deletions experimental/apps-mcp/evals/init/setup_eval.sh
@@ -0,0 +1,55 @@
#!/bin/bash
set -e

echo "=== Apps-MCP Eval Setup ==="
echo "Python version: $(python --version)"

# Install Node.js (required for klaudbiusz eval)
echo "Installing Node.js..."
curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
sudo apt-get install -y nodejs

echo "Node version: $(node --version)"
echo "npm version: $(npm --version)"

# Install Docker (required for --no-dagger mode)
# [PR review comment — Contributor Author]: remove
echo "Installing Docker..."
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh
rm get-docker.sh

# Configure Docker to use vfs storage driver (works without privileged mode)
echo "Configuring Docker with vfs storage driver..."
sudo mkdir -p /etc/docker
cat <<EOF | sudo tee /etc/docker/daemon.json
{
"storage-driver": "vfs"
}
EOF

# Stop any existing Docker daemon
sudo systemctl stop docker 2>/dev/null || true
sudo pkill dockerd 2>/dev/null || true
sleep 2

# Start Docker daemon
echo "Starting Docker daemon..."
sudo dockerd --storage-driver=vfs &
sleep 10

# Verify Docker is running
echo "Docker version: $(docker --version)"
sudo docker info || echo "Warning: Docker daemon may not be fully started"

# Allow non-root user to run docker
sudo usermod -aG docker $(whoami) || true
sudo chmod 666 /var/run/docker.sock || true

# Pre-pull the node image to speed up evaluation
echo "Pre-pulling node:20-alpine image..."
docker pull node:20-alpine || echo "Warning: Could not pre-pull image"

# Install Python dependencies
pip install fire mlflow

echo "=== Setup complete ==="
15 changes: 15 additions & 0 deletions experimental/apps-mcp/evals/init/setup_generation.sh
@@ -0,0 +1,15 @@
#!/bin/bash
set -e

echo "=== Setting up generation environment ==="

# Install Dagger (required for klaudbiusz container orchestration)
echo "Installing Dagger..."
# [PR review comment — Contributor Author]: remove

curl -fsSL https://dl.dagger.io/dagger/install.sh | sh
export PATH=$PATH:/root/.local/bin

# Install Python dependencies for klaudbiusz
echo "Installing Python dependencies..."
pip install --quiet dagger-io fire tqdm python-dotenv claude-agent-sdk litellm joblib tenacity

echo "=== Setup complete ==="
24 changes: 24 additions & 0 deletions experimental/apps-mcp/evals/pyproject.toml
@@ -0,0 +1,24 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src"]

[project]
name = "apps_mcp_evals"
version = "0.1.0"
description = "Continuous evaluation framework for Apps-MCP code generation"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"mlflow>=2.15.0",
"fire>=0.7.1",
]

[project.scripts]
main = "src.run_evals:cli"

[tool.ruff]
line-length = 120
target-version = "py310"
68 changes: 68 additions & 0 deletions experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
@@ -0,0 +1,68 @@
# Apps-MCP Evaluation Job
# Runs nightly + supports manual trigger via: databricks bundle run -t dev apps_eval_job

resources:
  jobs:
    apps_eval_job:
      name: "[${bundle.target}] Apps-MCP Continuous Evals"

      # Nightly schedule (2am UTC)
      trigger:
        periodic:
          interval: 1
          unit: DAYS

      # Health monitoring - alert if eval takes > 2 hours
      health:
        rules:
          - metric: RUN_DURATION_SECONDS
            op: GREATER_THAN
            value: 7200

      email_notifications:
        on_failure:
          - apps-mcp-team@databricks.com

      parameters:
        - name: mlflow_experiment
          default: ${var.mlflow_experiment}
        - name: parallelism
          default: ${var.eval_parallelism}
        - name: evals_git_url
          default: ${var.evals_git_url}
        - name: apps_volume
          default: ${var.apps_volume}

      job_clusters:
        - job_cluster_key: eval_cluster
          new_cluster:
            spark_version: "16.2.x-scala2.12"
            node_type_id: "n2-standard-4"
            num_workers: 0
            data_security_mode: SINGLE_USER
            spark_conf:
              spark.databricks.cluster.profile: singleNode
              spark.master: "local[*]"
            custom_tags:
              ResourceClass: SingleNode
            spark_env_vars:
              DATABRICKS_HOST: ${workspace.host}
              DATABRICKS_TOKEN: "{{secrets/apps-mcp-evals/databricks-token}}"
              # [PR review comment — Contributor]: I don't think PAT is necessary here
              # [PR review comment — Contributor]: Please also note that job parameters/env vars are not censored.
              #   If you need to pass a token, you can use Databricks secrets instead (which also is supported from the cluster spec).

            init_scripts:
              - workspace:
                  destination: ${workspace.file_path}/init/setup_eval.sh

      tasks:
        - task_key: run_evals
          job_cluster_key: eval_cluster
          spark_python_task:
            python_file: ${workspace.file_path}/src/run_evals.py
            parameters:
              - --mlflow-experiment
              - ${var.mlflow_experiment}
              - --parallelism
              - ${var.eval_parallelism}
              - --evals-git-url
              - ${var.evals_git_url}
              - --apps-volume
              - ${var.apps_volume}
39 changes: 39 additions & 0 deletions experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
@@ -0,0 +1,39 @@
resources:
  jobs:
    apps_generation_job:
      name: "[${bundle.target}] Apps-MCP Generation"

      job_clusters:
        - job_cluster_key: generation_cluster
          new_cluster:
            spark_version: "16.2.x-scala2.12"
            node_type_id: "n2-standard-8"
            num_workers: 0
            data_security_mode: SINGLE_USER
            spark_conf:
              spark.databricks.cluster.profile: singleNode
              spark.master: "local[*]"
            custom_tags:
              ResourceClass: SingleNode
            spark_env_vars:
              ANTHROPIC_API_KEY: "{{secrets/apps-mcp-evals/anthropic-api-key}}"
              DATABRICKS_HOST: ${workspace.host}
              DATABRICKS_TOKEN: "{{secrets/apps-mcp-evals/databricks-token}}"
            init_scripts:
              - workspace:
                  destination: ${workspace.file_path}/init/setup_generation.sh

      tasks:
        - task_key: generate_apps
          job_cluster_key: generation_cluster
          spark_python_task:
            python_file: ${workspace.file_path}/src/generate_apps.py
            parameters:
              - --mcp-binary
              - ${var.cli_binary_volume}/databricks-linux
              - --output-volume
              - ${var.apps_volume}
              - --prompts
              - ${var.prompts}
              - --max-concurrency
              - ${var.generation_parallelism}