Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions agents.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
max_parallel: 0 # 0 = auto-detect from available RAM
log_dir: outputs/logs
model_ram_gb: 19 # qwen3:30b Q4_K_M — adjust for your model
# inference: remote # uncomment for remote Ollama (RunPod/GPU server) — disables RAM-based concurrency

agents:
- name: qa-agent
Expand Down
61 changes: 49 additions & 12 deletions cmd/shellforge/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
"github.com/AgentGuardHQ/shellforge/internal/scheduler"
)

var version = "0.4.8"
var version = "dev"

func main() {
if len(os.Args) < 2 {
Expand Down Expand Up @@ -147,13 +147,35 @@ total := 6
// ── Detect environment ──
isServer := !hasGPU() && runtime.GOOS == "linux"
model := ""
remoteOllamaHost := "" // set if user configures a remote Ollama endpoint

// ── Step 1: Ollama (skip on headless server) ──
// ── Step 1: Ollama (local install or remote endpoint) ──
steps++
if isServer {
fmt.Printf("── Step %d/%d: Ollama (skipped — server mode) ──\n", steps, total)
fmt.Println(" Detected: Linux, no GPU — skipping local model setup")
fmt.Println(" Use CLI drivers instead: shellforge run claude, copilot, codex, gemini")
fmt.Printf("── Step %d/%d: Ollama ──\n", steps, total)
fmt.Println(" Detected: Linux, no GPU")
fmt.Println(" Options:")
fmt.Println(" 1) Configure remote Ollama endpoint (OLLAMA_HOST)")
fmt.Println(" 2) Use API drivers only (Claude, Copilot, Codex, Gemini)")
fmt.Print(" Pick [2]: ")
serverChoice := readLine(reader)
if strings.TrimSpace(serverChoice) == "1" {
fmt.Print(" Remote Ollama URL [http://localhost:11434]: ")
hostInput := strings.TrimSpace(readLine(reader))
if hostInput == "" {
hostInput = "http://localhost:11434"
}
remoteOllamaHost = hostInput
fmt.Printf(" → OLLAMA_HOST=%s\n", remoteOllamaHost)
if ollama.IsRunning() {
fmt.Printf(" ✓ Ollama reachable at %s\n", remoteOllamaHost)
} else {
fmt.Printf(" ⚠ Ollama not reachable at %s — verify it is running\n", remoteOllamaHost)
}
fmt.Printf(" Tip: export OLLAMA_HOST=%s\n", remoteOllamaHost)
Comment on lines +162 to +175
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In server-mode remote Ollama setup, the connectivity check uses ollama.IsRunning() but never updates internal/ollama.Host (it’s initialized once from OLLAMA_HOST at package init). As written, this check will still ping the default host (usually http://localhost:11434) rather than remoteOllamaHost, so the “reachable” result can be wrong. Set ollama.Host = remoteOllamaHost (and/or os.Setenv("OLLAMA_HOST", remoteOllamaHost) before calling ollama.IsRunning()), ideally restoring the previous value afterward.

Copilot uses AI. Check for mistakes.
} else {
fmt.Println(" Skipping Ollama — use CLI drivers: shellforge run claude, copilot, codex, gemini")
}
fmt.Println()
} else {
fmt.Printf("── Step %d/%d: Ollama (local LLM inference) ──\n", steps, total)
Expand Down Expand Up @@ -296,10 +318,9 @@ fmt.Println()
steps++
fmt.Printf("── Step %d/%d: Agent drivers ──\n", steps, total)

// On Mac/GPU: offer Goose (local models via Ollama). On server: skip, show API drivers.
if !isServer {
// Offer Goose on all platforms — it runs headlessly and works with remote Ollama.
if _, err := exec.LookPath("goose"); err != nil {
fmt.Println(" Goose — AI agent with native Ollama support (actually executes tools)")
fmt.Println(" Goose — AI agent with native Ollama support (works headlessly)")
fmt.Print(" Install Goose? [Y/n] ")
if confirm(reader) {
fmt.Println(" → Installing Goose...")
Expand All @@ -310,14 +331,17 @@ run("sh", "-c", "curl -fsSL https://github.com/block/goose/releases/download/sta
}
if _, err := exec.LookPath("goose"); err == nil {
fmt.Println(" ✓ Goose installed")
if remoteOllamaHost != "" {
fmt.Printf(" → Run 'goose configure' and set Ollama host to %s\n", remoteOllamaHost)
} else {
fmt.Println(" → Run 'goose configure' to set up Ollama provider")
}
} else {
fmt.Println(" ⚠ Install failed — try: brew install --cask block-goose")
}
}
} else {
fmt.Println(" ✓ Goose installed (local model driver)")
}
fmt.Println(" ✓ Goose installed")
}

// Show API-based drivers
Expand Down Expand Up @@ -393,14 +417,22 @@ fmt.Println("║ Setup Complete ║")
fmt.Println("╚══════════════════════════════════════╝")
fmt.Println()
if isServer {
if remoteOllamaHost != "" {
fmt.Println(" Server mode — remote Ollama configured:")
fmt.Printf(" export OLLAMA_HOST=%s\n", remoteOllamaHost)
fmt.Println(" shellforge run goose \"describe this project\"")
fmt.Println(" shellforge serve agents.yaml")
fmt.Println()
fmt.Println(" Or use CLI drivers:")
} else {
fmt.Println(" Server mode — use CLI drivers:")
}
fmt.Println(" shellforge run claude \"review open PRs\"")
fmt.Println(" shellforge run copilot \"update docs\"")
fmt.Println(" shellforge run codex \"generate tests\"")
fmt.Println()
fmt.Println(" Run a swarm:")
fmt.Println(" shellforge swarm # start Dagu dashboard")
fmt.Println(" dagu start dags/multi-driver-swarm.yaml")
fmt.Println(" shellforge swarm # start Octi Pulpo dashboard")
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The setup summary prints shellforge swarm # start Octi Pulpo dashboard, but the swarm command in this repo is explicitly a Dagu setup (cmdSwarm() prints "Swarm Setup (Dagu)" and starts dagu server). This message is misleading for users; update it to reference Dagu (or adjust the command implementation if the intent is actually Octi Pulpo).

Suggested change
fmt.Println(" shellforge swarm # start Octi Pulpo dashboard")
fmt.Println(" shellforge swarm # start Dagu dashboard")

Copilot uses AI. Check for mistakes.
} else {
fmt.Println(" Quick start:")
fmt.Println(" shellforge run goose \"describe this project\"")
Expand Down Expand Up @@ -1060,7 +1092,12 @@ return eng

func mustOllama() {
if !ollama.IsRunning() {
if strings.HasPrefix(ollama.Host, "http://localhost") || strings.HasPrefix(ollama.Host, "http://127.") {
fmt.Fprintln(os.Stderr, "ERROR: Ollama not running. Start: ollama serve")
Comment on lines 1094 to 1096
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The localhost vs remote detection in mustOllama() relies on hard-coded http://localhost / http://127. prefixes. If OLLAMA_HOST is set without a scheme (e.g. localhost:11434) or uses https://, this classification will be wrong and the message may confuse users. Parsing the URL (or at least checking for localhost/127.0.0.1 with/without scheme) would make the error message selection more robust.

Copilot uses AI. Check for mistakes.
} else {
fmt.Fprintf(os.Stderr, "ERROR: Ollama not reachable at %s\n", ollama.Host)
fmt.Fprintln(os.Stderr, " Check that the remote Ollama server is running and accessible.")
}
Comment on lines +1099 to +1100
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mustOllama()’s remote-host error path doesn’t point users to OLLAMA_HOST, even though ollama.Host is derived from that env var. Consider including an explicit hint like “Set OLLAMA_HOST to your remote endpoint” (and optionally echo the current value) so users know how to fix remote setups.

Suggested change
fmt.Fprintln(os.Stderr, " Check that the remote Ollama server is running and accessible.")
}
fmt.Fprintln(os.Stderr, " Check that the remote Ollama server is running and accessible.")
envHost := os.Getenv("OLLAMA_HOST")
if envHost == "" {
fmt.Fprintln(os.Stderr, " Hint: Set OLLAMA_HOST to your remote Ollama endpoint (e.g. http://your-server:11434).")
} else {
fmt.Fprintf(os.Stderr, " Hint: OLLAMA_HOST is currently set to %s — update it if this is not your remote endpoint.\n", envHost)
}
}

Copilot uses AI. Check for mistakes.
os.Exit(1)
}
}
Expand Down
10 changes: 9 additions & 1 deletion internal/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type ServeConfig struct {
MaxParallel int `yaml:"max_parallel"` // 0 = auto-detect
LogDir string `yaml:"log_dir"`
ModelRAM int `yaml:"model_ram_gb"` // estimated model RAM in GB, default 19
Inference string `yaml:"inference"` // "remote" disables RAM-based concurrency detection
Agents []AgentConfig `yaml:"agents"`
}

Expand Down Expand Up @@ -132,7 +133,14 @@ func detectTotalRAM() uint64 {
func New(cfg *ServeConfig, run RunFunc) *Scheduler {
maxP := cfg.MaxParallel
if maxP <= 0 {
maxP = DetectMaxParallel(cfg.ModelRAM)
if cfg.Inference == "remote" {
// Remote inference: GPU VRAM is the bottleneck, not local RAM.
// Default to 4 concurrent slots; user can override via max_parallel.
maxP = 4
fmt.Println("[scheduler] inference=remote — max_parallel=4 (GPU-bound, not RAM-bound)")
} else {
maxP = DetectMaxParallel(cfg.ModelRAM)
}
}
os.MkdirAll(cfg.LogDir, 0o755)
return &Scheduler{
Expand Down
90 changes: 90 additions & 0 deletions internal/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package scheduler

import (
	"os"
	"path/filepath"
	"testing"

	"gopkg.in/yaml.v3"
)

// TestServeConfigInferenceField verifies that an explicit
// "inference: remote" key in the YAML is decoded into ServeConfig.
func TestServeConfigInferenceField(t *testing.T) {
	const doc = `
max_parallel: 0
log_dir: outputs/logs
model_ram_gb: 8
inference: remote
agents: []
`
	var parsed ServeConfig
	err := yaml.Unmarshal([]byte(doc), &parsed)
	if err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if got, want := parsed.Inference, "remote"; got != want {
		t.Errorf("inference = %q, want %q", got, want)
	}
}

// TestServeConfigInferenceDefaultsEmpty verifies that omitting the
// inference key leaves ServeConfig.Inference at its zero value.
func TestServeConfigInferenceDefaultsEmpty(t *testing.T) {
	const doc = `
max_parallel: 0
log_dir: outputs/logs
model_ram_gb: 8
agents: []
`
	var parsed ServeConfig
	err := yaml.Unmarshal([]byte(doc), &parsed)
	if err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if parsed.Inference != "" {
		t.Errorf("inference = %q, want empty", parsed.Inference)
	}
}

func TestNewSchedulerRemoteInferenceDefaultsToFour(t *testing.T) {
cfg := &ServeConfig{
MaxParallel: 0,
Inference: "remote",
LogDir: t.TempDir(),
ModelRAM: 8,
}
var noopRun RunFunc = func(name, system, prompt string, timeoutSec int) error { return nil }
sched := New(cfg, noopRun)
if cap(sched.slots) != 4 {
t.Errorf("remote inference: slots cap = %d, want 4", cap(sched.slots))
}
}

func TestNewSchedulerRemoteInferenceRespectsExplicitMaxParallel(t *testing.T) {
cfg := &ServeConfig{
MaxParallel: 8,
Inference: "remote",
LogDir: t.TempDir(),
ModelRAM: 8,
}
var noopRun RunFunc = func(name, system, prompt string, timeoutSec int) error { return nil }
sched := New(cfg, noopRun)
if cap(sched.slots) != 8 {
t.Errorf("remote inference with explicit max_parallel: slots cap = %d, want 8", cap(sched.slots))
}
}

// TestLoadConfigInferenceField verifies that LoadConfig reads the
// inference field from an on-disk agents.yaml file.
func TestLoadConfigInferenceField(t *testing.T) {
	content := `
max_parallel: 0
log_dir: outputs/logs
model_ram_gb: 8
inference: remote
agents: []
`
	// filepath.Join builds an OS-correct path; string concatenation with
	// "/" would break on platforms with a different separator.
	path := filepath.Join(t.TempDir(), "agents.yaml")
	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
		t.Fatalf("write temp config: %v", err)
	}
	cfg, err := LoadConfig(path)
	if err != nil {
		t.Fatalf("LoadConfig: %v", err)
	}
	if cfg.Inference != "remote" {
		t.Errorf("loaded inference = %q, want %q", cfg.Inference, "remote")
	}
}
Loading