FrontierCS · joyemang33 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/2.0/problems/vector_db_ann/config.yaml b/2.0/problems/vector_db_ann/config.yaml
@@ -19,8 +19,11 @@ runtime:
   docker:
     image: ubuntu:24.04
 environment:
-  cpus: 4
-  memory_mb: 8192
+  # If these resource limits change, also update the resource budget text in
+  # readme and harbor/app/README.md so agents can design parallel algorithms
+  # for the actual CPU and memory budget.
+  cpus: 8
+  memory_mb: 16384
   storage_mb: 8192
   build_timeout_seconds: 3600
 submission:

diff --git a/2.0/problems/vector_db_ann/harbor/app/README.md b/2.0/problems/vector_db_ann/harbor/app/README.md
@@ -20,6 +20,14 @@ cargo 1.75
 Pin crate versions if newer transitive dependencies require a newer Rust
 compiler.
 
+The Harbor task provides the following resource budget:
+
+```text
+vCPUs: 8
+memory: 16 GiB
+query concurrency: 4
+```
+
 ## Attribution
 
 This starter skeleton is adapted from KCORES/vector-db-bench, licensed under

diff --git a/2.0/problems/vector_db_ann/readme b/2.0/problems/vector_db_ann/readme
@@ -36,6 +36,15 @@ cargo 1.75
 If you add crates, choose versions compatible with this toolchain or pin
 transitive dependencies accordingly.
 
+The service and judge run with the task resource limits below. Design your
+parallel search and indexing strategy for this budget:
+
+```text
+vCPUs: 8
+memory: 16 GiB
+query concurrency: 4
+```
+
 The service must listen on `PORT` and implement these endpoints:
 
 ```text
@@ -92,15 +101,21 @@ A submission is valid if:
 
 ## Scoring
 
-The evaluator runs an official exact-search reference HTTP service once per
-judge process on the same hidden benchmark and through the same `/bulk_insert`
-and `/search` client harnesses to measure:
+At trial startup, the Harbor judge sidecar prepares the hidden benchmark and
+runs an exact-search reference HTTP service through the same `/bulk_insert`
+and `/search` client harnesses used for submissions. This run produces the
+ground-truth nearest neighbors and the trial-local scoring baseline:
 
 ```text
 baseline_qps
+baseline_effective_qps
 baseline_load_seconds
 ```
 
+Interactive submissions and the final verifier both score through this same
+judge sidecar, so the baseline and runtime environment are shared within a
+trial, while each machine still measures its own local baseline.
+
 Each submission is then timed independently. The load phase includes all
 `/bulk_insert` calls and any index construction performed by the service before
 queries begin. The query phase measures only `/search` throughput:

diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md
@@ -48,9 +48,11 @@ uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-demo
 
 ## Task Contract
 
-The agent works in `/app` and must create `/app/solution.py`. The final
-verifier runs the original Frontier-CS `2.0` evaluator and writes a normalized
-reward in `/logs/verifier/reward.txt`.
+The agent works in `/app` and must create `/app/solution.py` unless the task
+declares a directory submission. A judge sidecar prepares the task evaluator
+once per trial; both iterative submissions and the final verifier score
+through that same sidecar. The final verifier writes a normalized reward in
+`/logs/verifier/reward.txt`.
 
 During the trial, the agent can call:
 
@@ -61,11 +63,12 @@ bash /app/submit.sh
 This submits the current `/app/solution.py` to a black-box judge service,
 prints the score and feedback, and records each attempt in
 `/logs/agent/submissions.jsonl`. The evaluator source is not mounted into the
-agent workspace. The final verifier mirrors that log to
-`/logs/verifier/submissions.jsonl` for process-reward analysis. The reported
-reward is the maximum of the final `/app/solution.py` score and the best
-successful iterative submission, so a timed-out agent can keep its best
-submitted solution.
+agent workspace. The judge owns the authoritative submission log at
+`/logs/judge/submissions.jsonl`; the final verifier filters iterative agent
+submissions into `/logs/verifier/submissions.jsonl` for process-reward
+analysis. The reported reward is the maximum of the final submission score and
+the best successful iterative submission, so a timed-out agent can keep its
+best submitted solution.
 
 Some Harbor CLI versions print the timeout/error summary before rewards; in
 that case inspect `result.json`, `verifier/reward.json`, and

diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
@@ -233,8 +233,24 @@ def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Probl
             ),
             encoding="utf-8",
         )
-        for name in ("docker-compose.yaml", "judge_server.py", "submit.py"):
-            shutil.copy2(self.template_dir / "environment" / name, env_dir / name)
+        environment = problem.config.get("environment", {}) or {}
+        compose = (
+            self.template_dir / "environment" / "docker-compose.yaml"
+        ).read_text(encoding="utf-8")
+        (env_dir / "docker-compose.yaml").write_text(
+            compose.format(
+                judge_cpus=int(environment.get("cpus", 2)),
+                judge_memory_mb=int(environment.get("memory_mb", 4096)),
+            ),
+            encoding="utf-8",
+        )
+        shutil.copy2(
+            self.template_dir / "environment" / "judge_server.py",
+            env_dir / "judge_server.py",
+        )
+        shutil.copy2(
+            self.template_dir / "environment" / "submit.py", env_dir / "submit.py"
+        )
         # Kept in the build context for the judge image only; the main agent
         # image's Dockerfile does not copy this into /app.
         shutil.copy2(problem.problem_dir / "evaluator.py", env_dir / "problem_evaluator.py")
@@ -254,7 +270,9 @@ def _write_submission_config(self, env_dir: Path, problem: FrontierCS20Problem)
     def _write_tests(self, task_paths: "TaskPaths", problem: FrontierCS20Problem) -> None:
         tests_dir = task_paths.tests_dir
         shutil.copy2(self.template_dir / "tests" / "test.sh", tests_dir / "test.sh")
-        shutil.copy2(self.template_dir / "tests" / "evaluate.py", tests_dir / "evaluate.py")
+        shutil.copy2(
+            self.template_dir / "tests" / "evaluate.py", tests_dir / "evaluate.py"
+        )
         shutil.copy2(problem.problem_dir / "evaluator.py", tests_dir / "problem_evaluator.py")
         (tests_dir / "test.sh").chmod(0o755)
 

diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
@@ -8,8 +8,16 @@ services:
 
   judge:
     build:
-      context: ${CONTEXT_DIR}
+      context: ${{CONTEXT_DIR}}
       dockerfile: Dockerfile.judge
+    deploy:
+      resources:
+        limits:
+          cpus: "{judge_cpus}"
+          memory: "{judge_memory_mb}M"
+        reservations:
+          cpus: "{judge_cpus}"
+          memory: "{judge_memory_mb}M"
     command: ["python3", "/judge/judge_server.py"]
     expose:
       - "8082"

diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
@@ -14,11 +14,14 @@
 import time
 import traceback
 import threading
+from datetime import datetime, timezone
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from pathlib import Path
 from typing import Any
 
 PROBLEM_EVALUATOR_PATH = Path("/judge/problem_evaluator.py")
+JUDGE_READY_LOG = Path("/logs/judge/judge_ready.json")
+JUDGE_SUBMISSIONS_LOG = Path("/logs/judge/submissions.jsonl")
 MAX_SUBMISSION_BYTES = 30_000_000
 MAX_ARCHIVE_BYTES = 20_000_000
 
@@ -40,6 +43,44 @@ def load_problem_evaluator():
 READY_PAYLOAD: dict[str, Any] = {"status": "starting"}
 
 
+def now_iso() -> str:
+    return (
+        datetime.now(timezone.utc)
+        .isoformat(timespec="milliseconds")
+        .replace("+00:00", "Z")
+    )
+
+
+def write_judge_ready(payload: dict[str, Any]) -> None:
+    try:
+        JUDGE_READY_LOG.parent.mkdir(parents=True, exist_ok=True)
+        JUDGE_READY_LOG.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    except OSError as exc:
+        print(f"WARN: failed to write judge_ready.json: {exc}", flush=True)
+
+
+def log_submission(record: dict[str, Any]) -> None:
+    JUDGE_SUBMISSIONS_LOG.parent.mkdir(parents=True, exist_ok=True)
+    with JUDGE_SUBMISSIONS_LOG.open("a", encoding="utf-8") as f:
+        f.write(json.dumps({"ts": now_iso(), **record}, ensure_ascii=False) + "\n")
+
+
+def read_submissions() -> list[dict[str, Any]]:
+    if not JUDGE_SUBMISSIONS_LOG.exists():
+        return []
+    records: list[dict[str, Any]] = []
+    for line in JUDGE_SUBMISSIONS_LOG.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(record, dict):
+            records.append(record)
+    return records
+
+
 def prepare_evaluator() -> None:
     global EVALUATOR, READY, READY_PAYLOAD
     start = time.time()
@@ -60,6 +101,7 @@ def prepare_evaluator() -> None:
             "prepare_seconds": elapsed,
             **payload,
         }
+        write_judge_ready(READY_PAYLOAD)
         READY = True
         print(
             "[frontier judge] ready "
@@ -144,6 +186,9 @@ def do_GET(self) -> None:
         if self.path == "/health":
             self._write_json(200 if READY else 503, READY_PAYLOAD)
             return
+        if self.path == "/submissions":
+            self._write_json(200, {"status": "ok", "submissions": read_submissions()})
+            return
         self._write_json(404, {"status": "error", "error": "not found"})
 
     def do_POST(self) -> None:
@@ -176,34 +221,63 @@ def do_POST(self) -> None:
             self._write_json(413, {"status": "error", "error": "submission too large"})
             return
 
+        submission_uuid = ""
+        submission_role = "agent"
+        submission_kind = "file"
         try:
             payload = json.loads(self.rfile.read(content_length).decode("utf-8"))
+            submission_uuid = str(payload.get("submission_uuid") or "")
+            submission_role = str(payload.get("submission_role") or "agent")
             submission_kind = payload.get("submission_kind", "file")
             if submission_kind == "directory":
                 archive_b64 = payload.get("archive_b64")
                 if not isinstance(archive_b64, str) or not archive_b64:
                     raise ValueError(
                         "directory submission must include archive_b64"
                     )
-                self._write_json(200, evaluate_archive(archive_b64))
+                result = evaluate_archive(archive_b64)
+                log_submission(
+                    {
+                        "submission_uuid": submission_uuid,
+                        "submission_role": submission_role,
+                        "submission_kind": submission_kind,
+                        **result,
+                    }
+                )
+                self._write_json(200, result)
                 return
             code = payload.get("code")
             if not isinstance(code, str) or not code.strip():
                 raise ValueError(
                     "file submission must include non-empty string field 'code'"
                 )
-            self._write_json(200, evaluate_code(code))
+            result = evaluate_code(code)
+            log_submission(
+                {
+                    "submission_uuid": submission_uuid,
+                    "submission_role": submission_role,
+                    "submission_kind": submission_kind,
+                    **result,
+                }
+            )
+            self._write_json(200, result)
         except Exception:
             print(traceback.format_exc(), flush=True)
-            self._write_json(
-                200,
+            result = {
+                "status": "error",
+                "score": 0.0,
+                "score_unbounded": 0.0,
+                "message": "evaluation failed",
+            }
+            log_submission(
                 {
-                    "status": "error",
-                    "score": 0.0,
-                    "score_unbounded": 0.0,
-                    "message": "evaluation failed",
-                },
+                    "submission_uuid": submission_uuid,
+                    "submission_role": submission_role,
+                    "submission_kind": submission_kind,
+                    **result,
+                }
             )
+            self._write_json(200, result)
 
     def log_message(self, fmt: str, *args: object) -> None:
         return

diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
@@ -106,7 +106,7 @@ def make_directory_archive(root: Path, exclude: list[str]) -> tuple[str, int]:
     return base64.b64encode(buf.getvalue()).decode("ascii"), file_count
 
 
-def evaluate_with_judge(payload: dict) -> tuple[float, float, str, dict]:
+def evaluate_with_judge(payload: dict) -> dict:
     wait_for_judge()
     response = requests.post(
         f"{JUDGE_URL}/evaluate",
@@ -117,12 +117,7 @@ def evaluate_with_judge(payload: dict) -> tuple[float, float, str, dict]:
     payload = response.json()
     if payload.get("status") != "done":
         raise RuntimeError(str(payload.get("message") or payload.get("error") or payload))
-    return (
-        float(payload.get("score", 0.0)),
-        float(payload.get("score_unbounded", payload.get("score", 0.0))),
-        str(payload.get("message", "")),
-        dict(payload.get("metrics", {}) or {}),
-    )
+    return payload
 
 
 def main() -> int:
@@ -181,6 +176,7 @@ def main() -> int:
         )
         judge_payload = {
             "submission_kind": "directory",
+            "submission_uuid": sub_uuid,
             "archive_b64": archive_b64,
         }
     else:
@@ -199,11 +195,19 @@ def main() -> int:
                 }
             )
             return 2
-        judge_payload = {"submission_kind": "file", "code": code}
+        judge_payload = {
+            "submission_kind": "file",
+            "submission_uuid": sub_uuid,
+            "code": code,
+        }
 
     try:
         start = time.time()
-        score, score_unbounded, message, metrics = evaluate_with_judge(judge_payload)
+        judge_result = evaluate_with_judge(judge_payload)
+        score = float(judge_result.get("score", 0.0))
+        score_unbounded = float(judge_result.get("score_unbounded", score))
+        message = str(judge_result.get("message", ""))
+        metrics = dict(judge_result.get("metrics", {}) or {})
         elapsed_seconds = time.time() - start
         reward = float(score) / 100.0