Infini-AI-Lab · haizhongzheng · Jun 5, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -12,6 +12,7 @@ data-data
 data-log
 tmp-yaml/
 issue-draft/
+claude-doc/
 .pip-tmp/
 *.pid
 *.pdf
@@ -230,3 +231,9 @@ evaluation/data/AReaL-boba-2-RL-Code
 tmp*
 torchelastic_*
 torchinductor_*
+
+# Oolong HF dataset cache (auto-downloaded, multi-GB)
+astraflow/core/workflow/impl/oolong/oolong_*.jsonl
+
+# DeepDive HF dataset cache (auto-downloaded)
+astraflow/core/workflow/impl/deepdive/deepdive_*.jsonl
@@ -43,6 +43,8 @@ AstraFlow **natively** supports the following for LLM RL training **without any
 <!-- <p align="center"><i>AstraFlow training a multi-policy workflow on an elastic, heterogeneous, cross-region rollout pool — all at once, with no feature-specific code.</i></p> -->
 
 ## News
+- **[2026/06]** New recipe: **dynamic recursive agent** on TextCraft — a multi-turn agent that recursively spawns sub-agents sharing inventory under a team reward. See the [recipe docs](https://Infini-AI-Lab.github.io/astraflow/docs/en/recipes/textcraft-recursive.html).
+- **[2026/06]** AstraFlow **v0.1.1** released — CUDA 13 image, SGLang 0.5.12, Megatron weight-sync training backend, and transformers 5 support. See the [project website](https://Infini-AI-Lab.github.io/astraflow/).
 - **[2026/05]** AstraFlow **v0.1.0** released — first public release of the full system. See the [project website](https://Infini-AI-Lab.github.io/astraflow/).
 - **[2026/05]** AstraFlow paper is on [arXiv](https://arxiv.org/abs/2605.15565).
 

@@ -0,0 +1,172 @@
+"""Single-call auto-checklist grader — local replacement for
+``ai-rubric``'s ``rubric.core.checklist.RubricChecklistFast``.
+
+Matches the upstream package's behavior:
+
+  - **One LLM call** (not two) — the model generates the checklist and
+    scores every item in a single response.
+  - **Continuous per-item scores** (0-1, not binary pass/fail) so the LLM
+    can reflect partial satisfaction.
+  - **Holistic ``overall_score``** chosen by the LLM, not mechanical
+    ``passed / total`` (lets critical items dominate non-critical ones).
+  - **No caching** — fresh checklist every call (matches upstream).
+
+System prompt ported verbatim from
+``ai_rubric-0.2.4/rubric/prompts/generate-rubric-checklist-fast-system.jinja``.
+
+Usage::
+
+    from astraEnv.checklist import ChecklistGrader
+
+    grader = ChecklistGrader(goal="find the actor's birth year")
+    score, reason = await grader.aevaluate(context=trajectory_text)
+
+Uses ``astraEnv.judge.judge`` for the LLM call so we keep our retry / key
+handling. Temperature defaults to 1.0 for parity with the upstream package.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from astraEnv.judge import extract_json, judge
+
+
+# Verbatim from ai-rubric 0.2.4:
+# rubric/prompts/generate-rubric-checklist-fast-system.jinja
+#
+# NOTE: the output-format example below contains ``//`` comments, which are
+# not valid JSON. A judge model that copies them into its answer produces a
+# response that ``json.loads`` rejects. The text is kept unchanged because it
+# is meant to be a verbatim copy of the upstream prompt.
+_SYSTEM_PROMPT = (
+    "We are building a rubric to evaluate a task. We will do this by "
+    "decomposing success criteria for the task into a checklist\n"
+    "and reasoning about the task success using this checklist. The "
+    "checklist should comprehensively test that the task is successfully "
+    "completed.\n\n"
+    "The rubric checklist should be as comprehensive as possible, and "
+    "should be able to evaluate the task in a way that is fair and accurate.\n\n"
+    "The rubric checklist should be as concise as possible, and should be "
+    "able to be easily understood by a human.\n\n"
+    "The rubric checklist should be as easy to evaluate as possible.\n\n"
+    "To evaluate a task on a checklist, you may consider the following "
+    "procedure:\n"
+    "1. For each criterion, reason whether it is critical or non-critical.\n"
+    "2. For each criterion, provide a score between 0 and 1 for how well "
+    "the task satisfies the criterion.\n"
+    "3. Consider the overall progress towards task completion and allow "
+    "for partial credit when generating the overall score.\n\n"
+    "# Output Format\n"
+    "```json\n"
+    "{\n"
+    '    "checklist": [\n'
+    '        "...", // a list of strings\n'
+    "    ],\n"
+    '    "checklist_scores": [\n'
+    "        0.0, // between 0 and 1\n"
+    "    ],\n"
+    '    "reasoning": "...",\n'
+    '    "overall_score": 0.0 // between 0 and 1\n'
+    "}\n"
+    "```"
+)
+
+
+def _build_user_prompt(task: str, context: str) -> str:
+    """Mirrors generate-rubric-checklist-fast-user.jinja."""
+    return f"# Task\n{task}\n\n{context}\n\n# Your Evaluation Output"
+
+
+class ChecklistGrader:
+    """Single-call checklist grader matching ai-rubric's RubricChecklistFast.
+
+    Parameters
+    ----------
+    goal : str
+        The task goal the agent was given.
+    judge_model : str | None
+        Optional override for the judge model. None = astraEnv.judge default.
+    temperature : float
+        Sampling temperature. 1.0 matches the upstream package's default.
+    """
+
+    def __init__(
+        self,
+        goal: str,
+        *,
+        judge_model: str | None = None,
+        temperature: float = 1.0,
+    ):
+        self.goal = goal
+        self.judge_model = judge_model
+        self.temperature = temperature
+        # Most-recent parsed response — exposed for inspection / debugging.
+        self.last_checklist: list[str] = []
+        self.last_checklist_scores: list[float] = []
+        self.last_reasoning: str = ""
+        self.last_overall_score: float | None = None
+
+    def _judge_kwargs(self) -> dict[str, Any]:
+        kw: dict[str, Any] = {"temperature": self.temperature}
+        if self.judge_model:
+            kw["model"] = self.judge_model
+        return kw
+
+    async def aevaluate(self, *, context: str) -> tuple[float, str]:
+        """Run one LLM call that generates+scores the checklist.
+
+        Returns
+        -------
+        score : float in [0, 1]
+            The LLM's holistic ``overall_score``.
+        reason : str
+            The LLM's reasoning. Empty string on failure.
+
+        On any failure (network, parse, out-of-range score) returns
+        ``(0.0, error_message)`` — never raises.
+        """
+        user = _build_user_prompt(self.goal, context)
+        try:
+            raw = await judge(
+                system=_SYSTEM_PROMPT, user=user, **self._judge_kwargs()
+            )
+        except Exception as e:
+            return 0.0, f"checklist call failed: {e}"
+
+        try:
+            parsed = extract_json(raw)
+        except Exception as e:
+            return 0.0, f"checklist response unparseable: {e}"
+
+        try:
+            overall = float(parsed.get("overall_score", 0.0))
+        except (TypeError, ValueError) as e:
+            return 0.0, f"overall_score not a number: {e}"
+
+        # Clamp defensively; the upstream package raises if out of [0,1],
+        # but we prefer to log and continue so a flaky judge response
+        # never crashes the rollout.
+        overall = max(0.0, min(1.0, overall))
+
+        # Stash for inspection.
+        checklist = parsed.get("checklist") or []
+        scores = parsed.get("checklist_scores") or []
+        self.last_checklist = [str(x) for x in checklist if isinstance(x, (str, int, float))]
+        self.last_checklist_scores = []
+        for s in scores:
+            try:
+                self.last_checklist_scores.append(float(s))
+            except (TypeError, ValueError):
+                continue
+        self.last_reasoning = str(parsed.get("reasoning", ""))
+        self.last_overall_score = overall
+
+        return overall, self.last_reasoning
+
+
+async def grade_with_checklist(
+    goal: str,
+    context: str,
+    *,
+    judge_model: str | None = None,
+    temperature: float = 1.0,
+) -> tuple[float, str]:
+    """Convenience wrapper: build a grader and evaluate in one call."""
+    grader = ChecklistGrader(goal, judge_model=judge_model, temperature=temperature)
+    return await grader.aevaluate(context=context)
@@ -0,0 +1,151 @@
+"""Minimal LLM-as-a-judge utility.
+
+Two functions. Both stateless.
+
+- `judge(system, user, ...)` posts a (system, user) pair to Fireworks and
+  returns the raw assistant content string.
+- `extract_json(text)` parses JSON out of an LLM response, tolerating
+  common code-fence wrapping.
+
+Callers write their own rubric prompts and parse what they expect.
+See claude-doc/minimal-llm-judge-plan.md for the design rationale.
+
+Usage:
+    from astraEnv.judge import judge, extract_json
+
+    response = await judge(
+        system='You grade outputs. Return JSON {"score", "reason"}.',
+        user=f"Goal: {goal}\\n\\nOutput: {output}",
+    )
+    parsed = extract_json(response)
+    score = float(parsed["score"])
+
+Requires the env var `FIREWORKS_API_KEY`.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+from typing import Any
+
+import httpx
+
+# Fireworks chat-completions endpoint that judge() posts to.
+_API_URL = "https://api.fireworks.ai/inference/v1/chat/completions"
+# Model used when the caller does not pass ``model`` to judge().
+_DEFAULT_MODEL = "accounts/fireworks/models/gpt-oss-120b"
+# HTTP status codes judge() treats as transient and retries.
+_RETRY_STATUSES = {429, 500, 502, 503, 504}
+# Upper bound on request attempts per judge() call.
+_MAX_ATTEMPTS = 3
+
+
+class JudgeError(RuntimeError):
+    """Raised when the judge call cannot return a usable response.
+
+    judge() raises it when the API key is missing, when the API answers with
+    a non-retryable status, when a 200 response has an unexpected shape or
+    empty content, and when every retry attempt has failed.
+    """
+
+
+async def judge(
+    system: str,
+    user: str,
+    *,
+    model: str = _DEFAULT_MODEL,
+    temperature: float = 0.0,
+    max_tokens: int = 2048,
+    timeout_s: float = 60.0,
+) -> str:
+    """Send (system, user) to Fireworks; return the raw assistant content.
+
+    Retries up to 3 times with exponential backoff on transient failures
+    (429, 5xx, network errors). Raises JudgeError on persistent failure.
+
+    Default `max_tokens` is set generously (2048) because reasoning models
+    like gpt-oss-120b consume tokens for internal chain-of-thought before
+    emitting the final answer; too-tight budgets truncate before content.
+
+    For reasoning models that put their chain-of-thought into a separate
+    `reasoning_content` field, this function returns `content` if non-empty,
+    otherwise falls back to `reasoning_content`. extract_json() handles
+    both shapes.
+    """
+    api_key = os.environ.get("FIREWORKS_API_KEY")
+    if not api_key:
+        raise JudgeError("FIREWORKS_API_KEY environment variable is not set")
+
+    payload = {
+        "model": model,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": user},
+        ],
+    }
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    last_err: Exception | None = None
+    async with httpx.AsyncClient(timeout=timeout_s) as client:
+        for attempt in range(_MAX_ATTEMPTS):
+            try:
+                resp = await client.post(_API_URL, json=payload, headers=headers)
+            except httpx.RequestError as exc:
+                last_err = exc
+                await asyncio.sleep(2**attempt)
+                continue
+
+            if resp.status_code == 200:
+                try:
+                    message = resp.json()["choices"][0]["message"]
+                except (KeyError, IndexError, ValueError) as exc:
+                    raise JudgeError(
+                        f"Unexpected response shape: {resp.text[:500]}"
+                    ) from exc
+                # Prefer the canonical `content` field. Reasoning models
+                # (e.g. gpt-oss-120b) may emit only `reasoning_content`
+                # when truncated; fall back to that so extract_json can
+                # still find a JSON snippet inside the chain-of-thought.
+                content = message.get("content") or message.get("reasoning_content")
+                if not content:
+                    raise JudgeError(
+                        f"Empty assistant content: {resp.text[:500]}"
+                    )
+                return content
+
+            if resp.status_code in _RETRY_STATUSES:
+                last_err = JudgeError(
+                    f"Fireworks returned {resp.status_code}: {resp.text[:200]}"
+                )
+                await asyncio.sleep(2**attempt)
+                continue
+
+            raise JudgeError(
+                f"Fireworks returned {resp.status_code}: {resp.text[:500]}"
+            )
+
+    raise JudgeError(
+        f"judge() failed after {_MAX_ATTEMPTS} attempts: {last_err}"
+    ) from last_err
+
+
+def extract_json(text: str) -> dict[str, Any]:
+    """Parse JSON out of an LLM response, tolerating common fence wrapping.
+
+    Strategy (first success wins):
+      1. json.loads on the trimmed text
+      2. strip ```json ... ``` fences and retry
+      3. strip plain ``` ... ``` fences and retry
+      4. re-raise the original JSONDecodeError
+    """
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+
+    fenced = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE)
+    if fenced:
+        return json.loads(fenced.group(1).strip())
+
+    fenced = re.search(r"```\s*(.*?)\s*```", text, re.DOTALL)
+    if fenced:
+        return json.loads(fenced.group(1).strip())
+
+    return json.loads(text)