From fa78ec8d02028f6f8413a67cfcbd52e398769cec Mon Sep 17 00:00:00 2001 From: VibeCodingScientist Date: Mon, 16 Feb 2026 12:45:05 +0100 Subject: [PATCH 1/2] feat: expand verification engine with 8 new capabilities Add cross-cutting meta-verifiers, new domain adapters, and enhanced existing adapters to significantly broaden verification coverage. Cross-cutting verifiers (apply to any domain): - Citation and Reference: DOI resolution, OpenAlex/Semantic Scholar metadata match, claim support via Jaccard similarity, freshness - Statistical Forensics: GRIM test, SPRITE test, Benford's law, p-curve analysis for detecting fabricated statistics - Reproducibility Executor: git clone, dependency detection, Docker sandbox execution, output comparison against claimed results - Data Integrity: schema validation, duplicate detection, z-score outlier flagging, SHA-256 hash verification New domain adapters: - Chemistry: rdkit SMILES validation, stoichiometry balancing, PubChem + ChEMBL cross-reference, retrosynthesis route checks - Physics: conservation law checks, stability/divergence detection, convergence analysis, dimensional analysis (pint), symbolic math (sympy) Enhanced existing adapters: - Math multi-prover: Coq and Isabelle support alongside Lean 4 - ML live inference: benchmark_live claim type runs models in Docker sandbox against HuggingFace benchmarks Infrastructure: - Redis-backed async verification queue with distributed semaphores - Cross-cutting runner with weighted score merging (70/30 domain/CC) - 4 new Docker images (Coq, Isabelle, reproducibility, ML inference) - New dependencies: rdkit-pypi, pint, sympy - Comprehensive test suite (10 new test files, ~1600 lines of tests) Co-Authored-By: Claude Opus 4.6 --- Dockerfile | 1 + .../versions/010_verification_status.py | 40 ++ backend/main.py | 7 + backend/models.py | 5 + backend/payloads/task_payloads.py | 34 +- backend/requirements.txt | 3 + backend/routes/tasks.py | 98 ++- backend/routes/verification.py | 45 ++ 
backend/schemas.py | 46 +- backend/services/verification_queue.py | 424 ++++++++++++ backend/verification/chemistry_adapter.py | 602 ++++++++++++++++++ backend/verification/citation_verifier.py | 323 ++++++++++ backend/verification/containers/build.sh | 105 +++ .../containers/compbio.Dockerfile | 37 +- .../verification/containers/coq.Dockerfile | 9 + .../containers/isabelle.Dockerfile | 11 + .../containers/lean4-mathlib.Dockerfile | 63 +- .../containers/ml-inference.Dockerfile | 11 + .../containers/reproducibility.Dockerfile | 14 + backend/verification/cross_cutting_base.py | 37 ++ backend/verification/cross_cutting_runner.py | 180 ++++++ backend/verification/data_integrity.py | 308 +++++++++ backend/verification/dispatcher.py | 14 +- backend/verification/lean4_adapter.py | 166 +++++ backend/verification/ml_repro_adapter.py | 252 ++++++++ backend/verification/physics_adapter.py | 600 +++++++++++++++++ .../verification/reproducibility_executor.py | 317 +++++++++ backend/verification/statistical_forensics.py | 437 +++++++++++++ docker-compose.prod.yml | 2 + docker-compose.yml | 2 + .../test_chemistry_adapter.py | 173 +++++ .../test_citation_verifier.py | 145 +++++ .../test_cross_cutting_base.py | 88 +++ .../test_cross_cutting_runner.py | 214 +++++++ .../test_verification/test_data_integrity.py | 183 ++++++ tests/test_verification/test_dispatcher.py | 4 +- .../test_ml_live_inference.py | 135 ++++ tests/test_verification/test_payloads.py | 115 ++++ .../test_verification/test_physics_adapter.py | 266 ++++++++ .../test_reproducibility_executor.py | 148 +++++ .../test_statistical_forensics.py | 152 +++++ 41 files changed, 5792 insertions(+), 24 deletions(-) create mode 100644 backend/alembic/versions/010_verification_status.py create mode 100644 backend/routes/verification.py create mode 100644 backend/services/verification_queue.py create mode 100644 backend/verification/chemistry_adapter.py create mode 100644 backend/verification/citation_verifier.py create mode 100755 
backend/verification/containers/build.sh create mode 100644 backend/verification/containers/coq.Dockerfile create mode 100644 backend/verification/containers/isabelle.Dockerfile create mode 100644 backend/verification/containers/ml-inference.Dockerfile create mode 100644 backend/verification/containers/reproducibility.Dockerfile create mode 100644 backend/verification/cross_cutting_base.py create mode 100644 backend/verification/cross_cutting_runner.py create mode 100644 backend/verification/data_integrity.py create mode 100644 backend/verification/physics_adapter.py create mode 100644 backend/verification/reproducibility_executor.py create mode 100644 backend/verification/statistical_forensics.py create mode 100644 tests/test_verification/test_chemistry_adapter.py create mode 100644 tests/test_verification/test_citation_verifier.py create mode 100644 tests/test_verification/test_cross_cutting_base.py create mode 100644 tests/test_verification/test_cross_cutting_runner.py create mode 100644 tests/test_verification/test_data_integrity.py create mode 100644 tests/test_verification/test_ml_live_inference.py create mode 100644 tests/test_verification/test_physics_adapter.py create mode 100644 tests/test_verification/test_reproducibility_executor.py create mode 100644 tests/test_verification/test_statistical_forensics.py diff --git a/Dockerfile b/Dockerfile index 8a83bfe..fdd88c0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ libpq-dev \ + docker.io \ && rm -rf /var/lib/apt/lists/* RUN groupadd --gid 1000 appgroup \ diff --git a/backend/alembic/versions/010_verification_status.py b/backend/alembic/versions/010_verification_status.py new file mode 100644 index 0000000..34d9334 --- /dev/null +++ b/backend/alembic/versions/010_verification_status.py @@ -0,0 +1,40 @@ +"""Add verification status tracking columns to tasks. 
+ +Revision ID: 010 +Revises: 009 +Create Date: 2026-02-16 +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision: str = "010" +down_revision: Union[str, None] = "009" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("tasks", sa.Column("verification_status", sa.Text(), nullable=True)) + op.add_column("tasks", sa.Column("verification_job_id", sa.Text(), nullable=True)) + op.add_column("tasks", sa.Column("verification_queued_at", sa.DateTime(timezone=True), nullable=True)) + op.add_column("tasks", sa.Column("verification_started_at", sa.DateTime(timezone=True), nullable=True)) + op.add_column("tasks", sa.Column("verification_completed_at", sa.DateTime(timezone=True), nullable=True)) + + op.create_index( + "idx_tasks_verification_status", + "tasks", + ["verification_status"], + postgresql_where=sa.text("verification_status IS NOT NULL"), + ) + + +def downgrade() -> None: + op.drop_index("idx_tasks_verification_status", table_name="tasks") + op.drop_column("tasks", "verification_completed_at") + op.drop_column("tasks", "verification_started_at") + op.drop_column("tasks", "verification_queued_at") + op.drop_column("tasks", "verification_job_id") + op.drop_column("tasks", "verification_status") diff --git a/backend/main.py b/backend/main.py index 0b8019a..d0c75ac 100644 --- a/backend/main.py +++ b/backend/main.py @@ -45,11 +45,16 @@ async def lifespan(app: FastAPI): scheduler_task = asyncio.create_task(scheduler_loop(scheduler_stop)) logger.info("scheduler_started") + # Start verification queue consumer + from backend.services.verification_queue import start_queue, stop_queue + await start_queue() + logger.info("application_started") yield # Shutdown logger.info("shutting_down") + await stop_queue() if scheduler_task is not None: scheduler_stop.set() scheduler_task.cancel() @@ 
-104,6 +109,7 @@ async def lifespan(app: FastAPI): from backend.routes.lifecycle import router as lifecycle_router # noqa: E402 from backend.routes.notifications import router as notifications_router # noqa: E402 from backend.routes.lab_state import router as lab_state_router # noqa: E402 +from backend.routes.verification import router as verification_router # noqa: E402 import backend.verification.dispatcher # noqa: F401,E402 @@ -125,6 +131,7 @@ async def lifespan(app: FastAPI): app.include_router(lifecycle_router) app.include_router(notifications_router) app.include_router(lab_state_router) +app.include_router(verification_router) @app.get("/health") diff --git a/backend/models.py b/backend/models.py index e476533..9a2b2e4 100644 --- a/backend/models.py +++ b/backend/models.py @@ -518,6 +518,11 @@ class Task(Base): verification_score: Mapped[float | None] = mapped_column(DECIMAL(5, 4)) verification_badge: Mapped[str | None] = mapped_column(Text) verification_result: Mapped[dict | None] = mapped_column(JSONB) + verification_status: Mapped[str | None] = mapped_column(Text) + verification_job_id: Mapped[str | None] = mapped_column(Text) + verification_queued_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + verification_started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + verification_completed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) # Timestamps created_at: Mapped[datetime] = mapped_column( diff --git a/backend/payloads/task_payloads.py b/backend/payloads/task_payloads.py index b4494bb..9ca3644 100644 --- a/backend/payloads/task_payloads.py +++ b/backend/payloads/task_payloads.py @@ -68,14 +68,16 @@ class DeepResearchResult(BaseModel): class MathematicsPayload(BaseModel): """Extra fields for mathematics domain results that are verifiable.""" claim_type: str = Field("theorem", pattern=r"^(theorem|conjecture)$") + proof_system: str = Field("lean4", pattern=r"^(lean4|coq|isabelle)$") 
proof_code: str = Field(..., min_length=10) statement: str | None = None dependencies: list[str] = Field(default_factory=list) + theory_name: str | None = None # For Isabelle class MLAIPayload(BaseModel): """Extra fields for ML/AI domain results.""" - claim_type: str = Field("benchmark_result", pattern=r"^(benchmark_result|ml_experiment|architecture)$") + claim_type: str = Field("benchmark_result", pattern=r"^(benchmark_result|benchmark_live|ml_experiment|architecture)$") model_id: str | None = None benchmark: str | None = None metrics: dict[str, float] = Field(default_factory=dict) @@ -83,6 +85,7 @@ class MLAIPayload(BaseModel): code_commit: str | None = None code: str | None = None # For architecture claims param_count: int | None = None + sample_size: int = Field(20, ge=5, le=50) # For benchmark_live class CompBioPayload(BaseModel): @@ -133,6 +136,33 @@ class BioinformaticsPayload(BaseModel): annotations: list[dict] = Field(default_factory=list) +class ChemistryPayload(BaseModel): + """Extra fields for chemistry domain results.""" + claim_type: str = Field( + "reaction_mechanism", + pattern=r"^(reaction_mechanism|molecular_property|retrosynthesis)$", + ) + smiles: str | None = None + reactants: list[str] = Field(default_factory=list) + products: list[str] = Field(default_factory=list) + precursors: list[str] = Field(default_factory=list) + claimed_properties: dict = Field(default_factory=dict) + + +class PhysicsPayload(BaseModel): + """Extra fields for physics domain results.""" + claim_type: str = Field( + "numerical_simulation", + pattern=r"^(numerical_simulation|analytical_derivation|dimensional_analysis)$", + ) + simulation_data: dict = Field(default_factory=dict) + conservation_quantities: dict = Field(default_factory=dict) + expression: str | None = None + lhs: str | None = None + rhs: str | None = None + units: dict = Field(default_factory=dict) + + # ------------------------------------------ # VALIDATION DISPATCHER # 
------------------------------------------ @@ -153,6 +183,8 @@ class BioinformaticsPayload(BaseModel): "computational_biology": CompBioPayload, "materials_science": MaterialsSciencePayload, "bioinformatics": BioinformaticsPayload, + "chemistry": ChemistryPayload, + "physics": PhysicsPayload, } diff --git a/backend/requirements.txt b/backend/requirements.txt index 1571f4b..6965214 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -20,3 +20,6 @@ mp-api>=0.41.0 scipy>=1.11.0 numpy>=1.24.0 pandas>=2.0.0 +rdkit-pypi>=2024.3.1 +pint>=0.23 +sympy>=1.12 diff --git a/backend/routes/tasks.py b/backend/routes/tasks.py index 5e3baa2..c2d779f 100644 --- a/backend/routes/tasks.py +++ b/backend/routes/tasks.py @@ -26,9 +26,11 @@ TaskCreate, TaskDetailResponse, TaskResponse, + VerificationQueuedResponse, VerificationRequest, VoteResponse, ) +from backend.services.verification_queue import enqueue as enqueue_verification logger = get_logger(__name__) router = APIRouter(prefix="/api/labs/{slug}/tasks", tags=["tasks"]) @@ -185,6 +187,8 @@ async def list_tasks( parent_task_id=t.parent_task_id, forum_post_id=t.forum_post_id, lab_state_id=t.lab_state_id, + verification_status=t.verification_status, + verification_job_id=t.verification_job_id, created_at=t.created_at, started_at=t.started_at, completed_at=t.completed_at, @@ -239,6 +243,8 @@ async def get_task_detail( parent_task_id=task.parent_task_id, forum_post_id=task.forum_post_id, lab_state_id=task.lab_state_id, + verification_status=task.verification_status, + verification_job_id=task.verification_job_id, result=task.result, verification_score=float(task.verification_score) if task.verification_score else None, verification_badge=task.verification_badge, @@ -523,7 +529,7 @@ async def file_critique( return critique_task -@router.post("/{task_id}/verify", response_model=TaskDetailResponse) +@router.post("/{task_id}/verify", response_model=VerificationQueuedResponse) async def verify_task( slug: str, task_id: 
UUID, @@ -531,11 +537,10 @@ async def verify_task( agent: Agent = Depends(get_current_agent), ): """ - Run domain-specific verification on a completed/accepted task. PI only. + Queue domain-specific verification on a completed/accepted task. PI only. - Dispatches to the correct domain adapter (Math, ML, CompBio, Materials, - Bioinformatics) which runs the claim in a sandboxed Docker container - and returns a score/badge automatically. + Returns immediately with a job_id that can be polled via + ``GET /api/verification/jobs/{job_id}``. """ lab = await _get_lab(db, slug) await require_lab_role(db, lab.id, agent.id, "pi") @@ -556,7 +561,77 @@ async def verify_task( if task.verification_score is not None: raise HTTPException(status_code=409, detail="Task already verified. Submit a new task to re-verify.") - # Dispatch to domain adapter + # Already queued or running? + if task.verification_status in ("pending", "running"): + raise HTTPException(status_code=409, detail=f"Verification already {task.verification_status}. Poll job: {task.verification_job_id}") + + metadata = { + "task_type": task.task_type.value if hasattr(task.task_type, "value") else task.task_type, + "domain": task.domain, + "title": task.title, + "description": task.description, + "lab_slug": slug, + } + + try: + job_id = await enqueue_verification( + task_id=task.id, + domain=task.domain, + result=task.result, + metadata=metadata, + agent_id=agent.id, + assigned_to=task.assigned_to, + lab_id=lab.id, + lab_slug=slug, + ) + except RuntimeError: + raise HTTPException(status_code=429, detail="Verification queue full. 
Try again later.", headers={"Retry-After": "60"}) + + # Mark task as pending verification + task.verification_status = "pending" + task.verification_job_id = job_id + task.verification_queued_at = datetime.now(timezone.utc) + + await db.commit() + + logger.info("verification_queued", task_id=str(task_id), job_id=job_id) + return VerificationQueuedResponse( + status="queued", + job_id=job_id, + poll_url=f"/api/verification/jobs/{job_id}", + ) + + +@router.post("/{task_id}/verify-sync", response_model=TaskDetailResponse) +async def verify_task_sync( + slug: str, + task_id: UUID, + db: AsyncSession = Depends(get_db), + agent: Agent = Depends(get_current_agent), +): + """ + Synchronous verification fallback (blocks until complete). PI only. + + Intended for tests/dev. Production callers should use the async + ``POST /{task_id}/verify`` endpoint. + """ + lab = await _get_lab(db, slug) + await require_lab_role(db, lab.id, agent.id, "pi") + task = await _get_task(db, lab.id, task_id) + + current_status = task.status.value if isinstance(task.status, TaskStatusEnum) else task.status + if current_status not in ("completed", "accepted"): + raise HTTPException(status_code=400, detail="Task must be completed or accepted to verify") + + if not task.result: + raise HTTPException(status_code=400, detail="Task has no result to verify") + + if task.domain == "general": + raise HTTPException(status_code=400, detail="General domain tasks cannot be verified") + + if task.verification_score is not None: + raise HTTPException(status_code=409, detail="Task already verified. 
Submit a new task to re-verify.") + metadata = { "task_type": task.task_type.value if hasattr(task.task_type, "value") else task.task_type, "domain": task.domain, @@ -567,7 +642,6 @@ async def verify_task( vresult = await dispatch_verification(task.domain, task.result, metadata) - # Write results to task task.verification_score = vresult.score task.verification_badge = vresult.badge.value task.verification_result = { @@ -580,29 +654,29 @@ async def verify_task( "warnings": vresult.warnings, "compute_time_seconds": vresult.compute_time_seconds, } + task.verification_status = "completed" + task.verification_completed_at = datetime.now(timezone.utc) await sign_and_append( db, "task", task.id, "verification", agent.id, {"score": vresult.score, "badge": vresult.badge.value, "passed": vresult.passed, "domain": vresult.domain}, ) - # Award vRep based on verification score new_level = None if vresult.passed and task.assigned_to: - vrep_award = vresult.score * 20 # Up to 20 vRep for perfect verification + vrep_award = vresult.score * 20 new_level = await award_reputation( db, task.assigned_to, "vrep", vrep_award, "verification_passed", task_id=task_id, lab_id=lab.id, domain=task.domain, ) - # Log activity try: redis = get_redis() except RuntimeError: redis = None - badge_emoji = {"green": "🟢", "amber": "🟡", "red": "🔴"}.get(vresult.badge.value, "") + badge_emoji = {"green": "\U0001f7e2", "amber": "\U0001f7e1", "red": "\U0001f534"}.get(vresult.badge.value, "") await log_activity( db, redis, lab.id, slug, "task_verified", f"{badge_emoji} Verification {vresult.badge.value}: {task.title} (score: {vresult.score})", @@ -619,5 +693,5 @@ async def verify_task( await db.commit() await db.refresh(task, ["votes"]) - logger.info("task_verified", task_id=str(task_id), score=vresult.score, badge=vresult.badge.value) + logger.info("task_verified_sync", task_id=str(task_id), score=vresult.score, badge=vresult.badge.value) return task diff --git a/backend/routes/verification.py 
b/backend/routes/verification.py new file mode 100644 index 0000000..0010e41 --- /dev/null +++ b/backend/routes/verification.py @@ -0,0 +1,45 @@ +"""Verification job status polling endpoints.""" + +from fastapi import APIRouter, HTTPException + +from backend.logging_config import get_logger +from backend.schemas import VerificationJobStatus, VerificationQueueStats +from backend.services.verification_queue import get_job_status, get_semaphore_counts, queue_depth + +logger = get_logger(__name__) +router = APIRouter(prefix="/api/verification", tags=["verification"]) + + +@router.get("/jobs/{job_id}", response_model=VerificationJobStatus) +async def poll_job_status(job_id: str): + """Poll verification job status by job_id.""" + job = await get_job_status(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Verification job not found or expired") + + return VerificationJobStatus( + job_id=job.get("job_id", job_id), + status=job.get("status", "unknown"), + domain=job.get("domain"), + task_id=job.get("task_id"), + score=float(job["score"]) if job.get("score") not in ("", None) else None, + badge=job.get("badge") or None, + passed=job.get("passed") if job.get("passed") not in ("", None) else None, + errors=job.get("errors", []), + queued_at=job.get("queued_at") or None, + started_at=job.get("started_at") or None, + completed_at=job.get("completed_at") or None, + ) + + +@router.get("/queue-stats", response_model=VerificationQueueStats) +async def get_queue_stats(): + """Return queue depth and active semaphore counts.""" + depth = await queue_depth() + docker_count, api_count = await get_semaphore_counts() + + return VerificationQueueStats( + queue_depth=depth, + docker_semaphore=docker_count, + api_semaphore=api_count, + ) diff --git a/backend/schemas.py b/backend/schemas.py index 015e9b3..eb4476f 100644 --- a/backend/schemas.py +++ b/backend/schemas.py @@ -156,7 +156,7 @@ class ForumPostCreate(BaseModel): body: str = Field(..., min_length=1) domain: str 
| None = Field( default=None, - pattern=r"^(mathematics|ml_ai|computational_biology|materials_science|bioinformatics|general)$", + pattern=r"^(mathematics|ml_ai|computational_biology|materials_science|bioinformatics|chemistry|physics|general)$", ) tags: list[str] = Field(default_factory=list) parent_lab_id: UUID | None = None @@ -374,6 +374,8 @@ class TaskResponse(BaseModel): parent_task_id: UUID | None forum_post_id: UUID | None lab_state_id: UUID | None = None + verification_status: str | None = None + verification_job_id: str | None = None created_at: datetime started_at: datetime | None completed_at: datetime | None @@ -388,6 +390,48 @@ class TaskDetailResponse(TaskResponse): votes: list["VoteResponse"] = Field(default_factory=list) +# --------------------------------------------------------------------------- +# Verification Queue +# --------------------------------------------------------------------------- + + +class VerificationQueuedResponse(BaseModel): + status: str # "queued" + job_id: str + poll_url: str + + +class CrossCuttingResultResponse(BaseModel): + verifier: str + score: float + weight: float + details: dict = Field(default_factory=dict) + errors: list[str] = Field(default_factory=list) + warnings: list[str] = Field(default_factory=list) + compute_time_seconds: float = 0.0 + + +class VerificationJobStatus(BaseModel): + job_id: str + status: str # pending/running/completed/failed + domain: str | None = None + task_id: UUID | None = None + score: float | None = None + badge: str | None = None + passed: bool | None = None + errors: list[str] = Field(default_factory=list) + queued_at: str | None = None + started_at: str | None = None + completed_at: str | None = None + cross_cutting_results: list[CrossCuttingResultResponse] = Field(default_factory=list) + + +class VerificationQueueStats(BaseModel): + queue_depth: int = 0 + docker_semaphore: int = 0 + api_semaphore: int = 0 + + # 
--------------------------------------------------------------------------- # Voting # --------------------------------------------------------------------------- diff --git a/backend/services/verification_queue.py b/backend/services/verification_queue.py new file mode 100644 index 0000000..3b96110 --- /dev/null +++ b/backend/services/verification_queue.py @@ -0,0 +1,424 @@ +"""Redis-backed async verification queue. + +Runs as a background asyncio task (one per worker). Jobs are enqueued via +``enqueue()`` which pushes to a Redis LIST. The ``consumer_loop()`` task +pops jobs with BRPOP and runs them through the verification dispatcher, +writing results back to the DB and publishing SSE events. + +Distributed semaphores (Redis INCR/DECR) ensure that at most 2 Docker-based +and 4 API-based verification jobs run concurrently across all workers. +""" + +from __future__ import annotations + +import asyncio +import json +import uuid +from datetime import datetime, timezone +from uuid import UUID + +from backend.database import get_db_session +from backend.logging_config import get_logger +from backend.models import Task +from backend.redis import get_redis +from backend.services.activity_service import log_activity +from backend.services.reputation_service import award_reputation +from backend.services.signature_service import sign_and_append +from backend.verification.dispatcher import DOCKER_DOMAINS, dispatch_verification, get_adapter, is_docker_domain +from backend.verification.cross_cutting_runner import run_cross_cutting, merge_results + +logger = get_logger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +MAX_QUEUE_DEPTH = 20 +DOCKER_SEM_LIMIT = 2 +API_SEM_LIMIT = 4 +JOB_TTL_SECONDS = 86400 # 24 hours +BRPOP_TIMEOUT = 2 # seconds +MAX_RETRIES = 1 +SEM_SAFETY_TTL = 600 # 10 min safety expiry on semaphore keys + +# Redis keys 
+QUEUE_KEY = "verify:queue" +SEM_DOCKER_KEY = "verify:sem:docker" +SEM_API_KEY = "verify:sem:api" + +# Background task state +_consumer_task: asyncio.Task | None = None +_stop_event: asyncio.Event | None = None + + +# --------------------------------------------------------------------------- +# Helpers — distributed semaphore +# --------------------------------------------------------------------------- + + +def _sem_key(domain: str) -> str: + return SEM_DOCKER_KEY if domain in DOCKER_DOMAINS else SEM_API_KEY + + +def _sem_limit(domain: str) -> int: + return DOCKER_SEM_LIMIT if domain in DOCKER_DOMAINS else API_SEM_LIMIT + + +async def _acquire_sem(redis, domain: str) -> bool: + key = _sem_key(domain) + limit = _sem_limit(domain) + count = await redis.incr(key) + if count > limit: + await redis.decr(key) + return False + await redis.expire(key, SEM_SAFETY_TTL) + return True + + +async def _release_sem(redis, domain: str) -> None: + key = _sem_key(domain) + val = await redis.decr(key) + # Guard against going negative (e.g. 
after restart) + if val < 0: + await redis.set(key, 0) + + +# --------------------------------------------------------------------------- +# Job hash helpers +# --------------------------------------------------------------------------- + + +def _job_key(job_id: str) -> str: + return f"verify:{job_id}" + + +async def _set_job(redis, job_id: str, data: dict) -> None: + key = _job_key(job_id) + await redis.hset(key, mapping={k: json.dumps(v, default=str) if not isinstance(v, str) else v for k, v in data.items()}) + await redis.expire(key, JOB_TTL_SECONDS) + + +async def _get_job(redis, job_id: str) -> dict | None: + key = _job_key(job_id) + raw = await redis.hgetall(key) + if not raw: + return None + result = {} + for k, v in raw.items(): + try: + result[k] = json.loads(v) + except (json.JSONDecodeError, TypeError): + result[k] = v + return result + + +async def _update_job(redis, job_id: str, updates: dict) -> None: + key = _job_key(job_id) + mapping = {k: json.dumps(v, default=str) if not isinstance(v, str) else v for k, v in updates.items()} + await redis.hset(key, mapping=mapping) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +async def enqueue( + task_id: UUID, + domain: str, + result: dict, + metadata: dict, + agent_id: UUID, + assigned_to: UUID | None, + lab_id: UUID, + lab_slug: str, +) -> str: + """ + Enqueue a verification job. Returns the job_id. 
+ + Raises: + RuntimeError: if queue depth >= MAX_QUEUE_DEPTH + """ + redis = get_redis() + + # Check queue depth + depth = await redis.llen(QUEUE_KEY) + if depth >= MAX_QUEUE_DEPTH: + raise RuntimeError(f"Verification queue full ({depth}/{MAX_QUEUE_DEPTH})") + + job_id = f"vj-{uuid.uuid4().hex[:12]}" + now = datetime.now(timezone.utc).isoformat() + + job_data = { + "job_id": job_id, + "task_id": str(task_id), + "domain": domain, + "result": result, + "metadata": metadata, + "agent_id": str(agent_id), + "assigned_to": str(assigned_to) if assigned_to else "", + "lab_id": str(lab_id), + "lab_slug": lab_slug, + "status": "pending", + "attempt": 0, + "queued_at": now, + "started_at": "", + "completed_at": "", + "score": "", + "badge": "", + "passed": "", + "errors": [], + } + + await _set_job(redis, job_id, job_data) + await redis.lpush(QUEUE_KEY, job_id) + + logger.info("verification_enqueued", job_id=job_id, task_id=str(task_id), domain=domain) + return job_id + + +async def get_job_status(job_id: str) -> dict | None: + """Read a job's current state from Redis.""" + redis = get_redis() + return await _get_job(redis, job_id) + + +async def queue_depth() -> int: + """Return current queue depth.""" + redis = get_redis() + return await redis.llen(QUEUE_KEY) + + +async def get_semaphore_counts() -> tuple[int, int]: + """Return (docker_count, api_count).""" + redis = get_redis() + docker_raw = await redis.get(SEM_DOCKER_KEY) + api_raw = await redis.get(SEM_API_KEY) + return int(docker_raw or 0), int(api_raw or 0) + + +# --------------------------------------------------------------------------- +# Job processing +# --------------------------------------------------------------------------- + + +async def _process_job(job_data: dict) -> None: + """Execute verification and write results to DB.""" + job_id = job_data["job_id"] + domain = job_data["domain"] + task_id_str = job_data["task_id"] + task_result = job_data["result"] + task_metadata = job_data["metadata"] + 
agent_id_str = job_data["agent_id"] + assigned_to_str = job_data.get("assigned_to", "") + lab_id_str = job_data["lab_id"] + lab_slug = job_data["lab_slug"] + attempt = int(job_data.get("attempt", 0)) + + redis = get_redis() + now = datetime.now(timezone.utc).isoformat() + + # Acquire distributed semaphore + acquired = False + for _ in range(30): # wait up to 60s for a slot + acquired = await _acquire_sem(redis, domain) + if acquired: + break + await asyncio.sleep(2) + + if not acquired: + logger.warning("verification_sem_timeout", job_id=job_id, domain=domain) + await _update_job(redis, job_id, {"status": "failed", "errors": ["Semaphore timeout"], "completed_at": now}) + return + + try: + await _update_job(redis, job_id, {"status": "running", "started_at": now}) + + # Check if the specific adapter/claim-type needs Docker + adapter = get_adapter(domain) + needs_docker = is_docker_domain(domain) + if adapter and hasattr(adapter, "requires_docker_for"): + needs_docker = adapter.requires_docker_for(task_result) + + # Run the domain adapter + vresult = await dispatch_verification(domain, task_result, task_metadata) + + # Run cross-cutting verifiers and merge results + cc_results = await run_cross_cutting(task_result, task_metadata) + if cc_results: + vresult = merge_results(vresult, cc_results) + + completed_at = datetime.now(timezone.utc) + completed_at_iso = completed_at.isoformat() + + # Update Redis job + await _update_job(redis, job_id, { + "status": "completed", + "score": vresult.score, + "badge": vresult.badge.value, + "passed": vresult.passed, + "errors": vresult.errors, + "completed_at": completed_at_iso, + }) + + # Write to DB + task_id = UUID(task_id_str) + agent_id = UUID(agent_id_str) + lab_id = UUID(lab_id_str) + + async with get_db_session() as db: + from sqlalchemy import select + task_row = (await db.execute(select(Task).where(Task.id == task_id))).scalar_one_or_none() + if task_row is None: + logger.error("verification_task_not_found", job_id=job_id, 
task_id=task_id_str) + return + + task_row.verification_score = vresult.score + task_row.verification_badge = vresult.badge.value + task_row.verification_result = { + "passed": vresult.passed, + "score": vresult.score, + "badge": vresult.badge.value, + "domain": vresult.domain, + "details": vresult.details, + "errors": vresult.errors, + "warnings": vresult.warnings, + "compute_time_seconds": vresult.compute_time_seconds, + } + task_row.verification_status = "completed" + task_row.verification_started_at = datetime.fromisoformat(now) + task_row.verification_completed_at = completed_at + + await sign_and_append( + db, "task", task_id, "verification", agent_id, + {"score": vresult.score, "badge": vresult.badge.value, "passed": vresult.passed, "domain": vresult.domain}, + ) + + # Award vRep + new_level = None + assigned_to_uuid = UUID(assigned_to_str) if assigned_to_str else None + if vresult.passed and assigned_to_uuid: + vrep_award = vresult.score * 20 + new_level = await award_reputation( + db, assigned_to_uuid, "vrep", vrep_award, + "verification_passed", task_id=task_id, lab_id=lab_id, + domain=domain, + ) + + # Log activity + badge_emoji = {"green": "\U0001f7e2", "amber": "\U0001f7e1", "red": "\U0001f534"}.get(vresult.badge.value, "") + await log_activity( + db, redis, lab_id, lab_slug, "task_verified", + f"{badge_emoji} Verification {vresult.badge.value}: {task_metadata.get('title', '')} (score: {vresult.score})", + agent_id=agent_id, task_id=task_id, + ) + + if new_level is not None: + await log_activity( + db, redis, lab_id, lab_slug, "agent_level_up", + f"Agent reached Level {new_level}", + agent_id=assigned_to_uuid, + ) + + await db.commit() + + logger.info("verification_job_completed", job_id=job_id, score=vresult.score, badge=vresult.badge.value) + + except Exception as exc: + logger.exception("verification_job_failed", job_id=job_id) + error_msg = str(exc) + failed_at = datetime.now(timezone.utc).isoformat() + + # Retry on transient failure + if attempt < 
MAX_RETRIES and _is_transient(exc): + logger.info("verification_job_retrying", job_id=job_id, attempt=attempt + 1) + await _update_job(redis, job_id, {"status": "pending", "attempt": attempt + 1}) + await redis.lpush(QUEUE_KEY, job_id) + else: + await _update_job(redis, job_id, { + "status": "failed", + "errors": [error_msg], + "completed_at": failed_at, + }) + # Mark task as failed in DB + try: + async with get_db_session() as db: + from sqlalchemy import select + task_row = (await db.execute(select(Task).where(Task.id == UUID(task_id_str)))).scalar_one_or_none() + if task_row: + task_row.verification_status = "failed" + task_row.verification_completed_at = datetime.now(timezone.utc) + await db.commit() + except Exception: + logger.exception("verification_db_update_failed", job_id=job_id) + finally: + await _release_sem(redis, domain) + + +def _is_transient(exc: Exception) -> bool: + """Check if an exception is likely transient (timeout, connection error).""" + transient_types = (TimeoutError, ConnectionError, OSError) + return isinstance(exc, transient_types) + + +# --------------------------------------------------------------------------- +# Consumer loop +# --------------------------------------------------------------------------- + + +async def consumer_loop(stop_event: asyncio.Event) -> None: + """Background loop: pop jobs from Redis queue and process them.""" + logger.info("verification_queue_started") + + while not stop_event.is_set(): + try: + redis = get_redis() + # BRPOP returns (key, value) or None on timeout + result = await redis.brpop(QUEUE_KEY, timeout=BRPOP_TIMEOUT) + if result is None: + continue + + _, job_id = result + job_data = await _get_job(redis, job_id) + if job_data is None: + logger.warning("verification_job_expired", job_id=job_id) + continue + + await _process_job(job_data) + + except asyncio.CancelledError: + break + except Exception: + logger.exception("verification_consumer_error") + await asyncio.sleep(1) + + 
logger.info("verification_queue_stopped") + + +# --------------------------------------------------------------------------- +# Lifecycle +# --------------------------------------------------------------------------- + + +async def start_queue() -> None: + """Start the verification consumer as a background task.""" + global _consumer_task, _stop_event + _stop_event = asyncio.Event() + _consumer_task = asyncio.create_task(consumer_loop(_stop_event)) + logger.info("verification_queue_started") + + +async def stop_queue() -> None: + """Stop the verification consumer gracefully.""" + global _consumer_task, _stop_event + if _stop_event is not None: + _stop_event.set() + if _consumer_task is not None: + _consumer_task.cancel() + try: + await _consumer_task + except asyncio.CancelledError: + pass + _consumer_task = None + _stop_event = None + logger.info("verification_queue_stopped") diff --git a/backend/verification/chemistry_adapter.py b/backend/verification/chemistry_adapter.py new file mode 100644 index 0000000..45f7464 --- /dev/null +++ b/backend/verification/chemistry_adapter.py @@ -0,0 +1,602 @@ +"""Chemistry verification: rdkit + PubChem + ChEMBL cross-reference. + +Validates chemical reactions, molecular properties, and retrosynthesis +routes. API-based (no Docker) — rdkit runs in-process via asyncio.to_thread(). 
+""" +from __future__ import annotations + +import asyncio +import time +from typing import Any + +import httpx + +from backend.logging_config import get_logger +from backend.verification.base import ( + VerificationAdapter, + VerificationBadge, + VerificationResult, +) + +logger = get_logger(__name__) + +PUBCHEM_API = "https://pubchem.ncbi.nlm.nih.gov/rest/pug" +CHEMBL_API = "https://www.ebi.ac.uk/chembl/api/data" +HTTP_TIMEOUT = 20 + +# Try to import rdkit — graceful degradation if unavailable +try: + from rdkit import Chem + from rdkit.Chem import Descriptors, rdMolDescriptors + RDKIT_AVAILABLE = True +except ImportError: + RDKIT_AVAILABLE = False + logger.warning("rdkit_not_available", note="Chemistry adapter will use API-only mode") + + +class ChemistryAdapter(VerificationAdapter): + domain = "chemistry" + + async def verify(self, task_result: dict, task_metadata: dict) -> VerificationResult: + claim_type = task_result.get("claim_type", "reaction_mechanism") + + if claim_type == "reaction_mechanism": + return await self._verify_reaction(task_result) + elif claim_type == "molecular_property": + return await self._verify_molecular_property(task_result) + elif claim_type == "retrosynthesis": + return await self._verify_retrosynthesis(task_result) + else: + return VerificationResult.fail(self.domain, [f"Unknown claim_type: {claim_type}"]) + + # ------------------------------------------------------------------ + # reaction_mechanism + # ------------------------------------------------------------------ + + async def _verify_reaction(self, result: dict) -> VerificationResult: + start = time.monotonic() + + reactants = result.get("reactants", []) + products = result.get("products", []) + smiles = result.get("smiles") + + if not reactants and not products and not smiles: + return VerificationResult.fail(self.domain, ["No reactants, products, or SMILES provided"]) + + # If single SMILES reaction string (e.g., "CC.O>>CCO") + if smiles and not reactants: + parts = 
smiles.split(">>") + if len(parts) == 2: + reactants = parts[0].split(".") + products = parts[1].split(".") + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"claim_type": "reaction_mechanism"} + + # Component 1: SMILES validity (0.20) + all_smiles = reactants + products + valid_result = await asyncio.to_thread(self._check_smiles_validity, all_smiles) + component_scores["smiles_valid"] = valid_result["score"] + details["smiles_validity"] = valid_result + + # Component 2: Stoichiometry balanced (0.30) + stoich_result = await asyncio.to_thread(self._check_stoichiometry, reactants, products) + component_scores["stoichiometry"] = stoich_result["score"] + details["stoichiometry"] = stoich_result + + # Component 3: Feasibility (0.30) + feas_result = await asyncio.to_thread(self._check_feasibility, reactants, products) + component_scores["feasibility"] = feas_result["score"] + details["feasibility"] = feas_result + + # Component 4: Atom mapping (0.20) + mapping_result = await asyncio.to_thread(self._check_atom_mapping, reactants, products) + component_scores["atom_mapping"] = mapping_result["score"] + details["atom_mapping"] = mapping_result + + weights = {"smiles_valid": 0.20, "stoichiometry": 0.30, "feasibility": 0.30, "atom_mapping": 0.20} + score = sum(weights[k] * component_scores[k] for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # molecular_property + # ------------------------------------------------------------------ + + async def _verify_molecular_property(self, result: dict) -> VerificationResult: + start = time.monotonic() + + smiles = result.get("smiles") + claimed_properties = 
result.get("claimed_properties", {}) + + if not smiles: + return VerificationResult.fail(self.domain, ["No SMILES string provided"]) + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"claim_type": "molecular_property", "smiles": smiles} + warnings: list[str] = [] + + # Component 1: Structure valid (0.20) + valid_result = await asyncio.to_thread(self._check_smiles_validity, [smiles]) + component_scores["structure_valid"] = valid_result["score"] + details["structure"] = valid_result + + if valid_result["score"] == 0.0: + elapsed = time.monotonic() - start + return VerificationResult( + passed=False, score=0.0, + badge=VerificationBadge.RED, + domain=self.domain, + details=details, + errors=["Invalid SMILES structure"], + compute_time_seconds=elapsed, + ) + + # Component 2: PubChem match (0.35) + pubchem_result = await self._check_pubchem(smiles, claimed_properties) + component_scores["pubchem_match"] = pubchem_result["score"] + details["pubchem"] = pubchem_result + + # Component 3: ChEMBL match (0.25) + chembl_result = await self._check_chembl(smiles, claimed_properties) + component_scores["chembl_match"] = chembl_result["score"] + details["chembl"] = chembl_result + + # Component 4: Property range (0.20) + range_result = await asyncio.to_thread(self._check_property_ranges, smiles, claimed_properties) + component_scores["property_range"] = range_result["score"] + details["property_range"] = range_result + if range_result.get("warnings"): + warnings.extend(range_result["warnings"]) + + weights = {"structure_valid": 0.20, "pubchem_match": 0.35, "chembl_match": 0.25, "property_range": 0.20} + score = sum(weights[k] * component_scores[k] for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + 
warnings=warnings, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # retrosynthesis + # ------------------------------------------------------------------ + + async def _verify_retrosynthesis(self, result: dict) -> VerificationResult: + start = time.monotonic() + + precursors = result.get("precursors", []) + products = result.get("products", []) + + if not precursors or not products: + return VerificationResult.fail(self.domain, ["precursors and products required"]) + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"claim_type": "retrosynthesis"} + + # Component 1: Precursors valid (0.25) + prec_result = await asyncio.to_thread(self._check_smiles_validity, precursors) + component_scores["precursors_valid"] = prec_result["score"] + details["precursors"] = prec_result + + # Component 2: Product valid (0.25) + prod_result = await asyncio.to_thread(self._check_smiles_validity, products) + component_scores["product_valid"] = prod_result["score"] + details["products"] = prod_result + + # Component 3: Atom conservation (0.30) + conserv_result = await asyncio.to_thread(self._check_atom_conservation, precursors, products) + component_scores["atom_conservation"] = conserv_result["score"] + details["atom_conservation"] = conserv_result + + # Component 4: Route plausibility (0.20) + plaus_result = await asyncio.to_thread(self._check_route_plausibility, precursors, products) + component_scores["route_plausibility"] = plaus_result["score"] + details["route_plausibility"] = plaus_result + + weights = {"precursors_valid": 0.25, "product_valid": 0.25, "atom_conservation": 0.30, "route_plausibility": 0.20} + score = sum(weights[k] * component_scores[k] for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + 
badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # Helpers — rdkit-based checks + # ------------------------------------------------------------------ + + @staticmethod + def _check_smiles_validity(smiles_list: list[str]) -> dict: + """Parse all SMILES and check validity.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable, skipping SMILES validation"} + + valid = 0 + invalid: list[str] = [] + for s in smiles_list: + mol = Chem.MolFromSmiles(s) + if mol is not None: + valid += 1 + else: + invalid.append(s) + + score = valid / len(smiles_list) if smiles_list else 0.0 + return { + "score": round(score, 4), + "valid": valid, + "invalid": invalid[:5], + "total": len(smiles_list), + } + + @staticmethod + def _check_stoichiometry(reactants: list[str], products: list[str]) -> dict: + """Check atom balance between reactants and products.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable"} + + def count_atoms(smiles_list: list[str]) -> dict[str, int]: + counts: dict[str, int] = {} + for s in smiles_list: + mol = Chem.MolFromSmiles(s) + if mol is None: + continue + for atom in mol.GetAtoms(): + sym = atom.GetSymbol() + counts[sym] = counts.get(sym, 0) + 1 + # Add implicit hydrogens + mol_h = Chem.AddHs(mol) + h_count = sum(1 for a in mol_h.GetAtoms() if a.GetSymbol() == "H") - \ + sum(1 for a in mol.GetAtoms() if a.GetSymbol() == "H") + counts["H"] = counts.get("H", 0) + h_count + return counts + + reactant_atoms = count_atoms(reactants) + product_atoms = count_atoms(products) + + if not reactant_atoms or not product_atoms: + return {"score": 0.0, "note": "Could not count atoms"} + + all_elements = set(reactant_atoms.keys()) | set(product_atoms.keys()) + balanced = 0 + imbalanced: list[str] = [] + + for elem in all_elements: + r_count = reactant_atoms.get(elem, 0) + 
p_count = product_atoms.get(elem, 0) + if r_count == p_count: + balanced += 1 + else: + imbalanced.append(f"{elem}: {r_count} -> {p_count}") + + score = balanced / len(all_elements) if all_elements else 0.0 + return { + "score": round(score, 4), + "balanced_elements": balanced, + "imbalanced": imbalanced[:10], + "reactant_atoms": reactant_atoms, + "product_atoms": product_atoms, + } + + @staticmethod + def _check_feasibility(reactants: list[str], products: list[str]) -> dict: + """Basic thermodynamic feasibility checks.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable"} + + issues: list[str] = [] + + for i, s in enumerate(products): + mol = Chem.MolFromSmiles(s) + if mol is None: + continue + + mw = Descriptors.MolWt(mol) + # Very large products from small reactants is suspicious + if mw > 2000: + issues.append(f"Product {i} has very high MW ({mw:.0f})") + + # Check for unusual valences + try: + Chem.SanitizeMol(mol) + except Exception: + issues.append(f"Product {i} has sanitization issues") + + score = max(0.0, 1.0 - 0.25 * len(issues)) + return { + "score": round(score, 4), + "issues": issues[:10], + } + + @staticmethod + def _check_atom_mapping(reactants: list[str], products: list[str]) -> dict: + """Check atom mapping consistency if mapping is provided.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable"} + + # Check if atom maps are present in the SMILES + has_maps = any(":" in s for s in reactants + products) + if not has_maps: + return {"score": 0.5, "note": "No atom mapping provided"} + + reactant_maps: set[int] = set() + product_maps: set[int] = set() + + for s in reactants: + mol = Chem.MolFromSmiles(s) + if mol: + for atom in mol.GetAtoms(): + am = atom.GetAtomMapNum() + if am > 0: + reactant_maps.add(am) + + for s in products: + mol = Chem.MolFromSmiles(s) + if mol: + for atom in mol.GetAtoms(): + am = atom.GetAtomMapNum() + if am > 0: + product_maps.add(am) + + if not reactant_maps and not 
product_maps: + return {"score": 0.5, "note": "No atom map numbers found"} + + # Maps should match between reactants and products + common = reactant_maps & product_maps + all_maps = reactant_maps | product_maps + score = len(common) / len(all_maps) if all_maps else 0.5 + + return { + "score": round(score, 4), + "reactant_maps": len(reactant_maps), + "product_maps": len(product_maps), + "common_maps": len(common), + } + + @staticmethod + def _check_atom_conservation(precursors: list[str], products: list[str]) -> dict: + """For retrosynthesis: atoms in precursors >= atoms in product.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable"} + + def total_heavy_atoms(smiles_list: list[str]) -> int: + total = 0 + for s in smiles_list: + mol = Chem.MolFromSmiles(s) + if mol: + total += mol.GetNumHeavyAtoms() + return total + + prec_atoms = total_heavy_atoms(precursors) + prod_atoms = total_heavy_atoms(products) + + if prec_atoms == 0 or prod_atoms == 0: + return {"score": 0.0, "note": "Could not count atoms"} + + # Precursors should have at least as many atoms as products + if prec_atoms >= prod_atoms: + score = 1.0 + else: + deficit = prod_atoms - prec_atoms + score = max(0.0, 1.0 - deficit / prod_atoms) + + return { + "score": round(score, 4), + "precursor_heavy_atoms": prec_atoms, + "product_heavy_atoms": prod_atoms, + "conserved": prec_atoms >= prod_atoms, + } + + @staticmethod + def _check_route_plausibility(precursors: list[str], products: list[str]) -> dict: + """Check for implausible disconnections in retrosynthesis.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable"} + + issues: list[str] = [] + + # Check that precursors are simpler than products + prec_complexity = sum( + Descriptors.BertzCT(Chem.MolFromSmiles(s)) + for s in precursors + if Chem.MolFromSmiles(s) is not None + ) + prod_complexity = sum( + Descriptors.BertzCT(Chem.MolFromSmiles(s)) + for s in products + if Chem.MolFromSmiles(s) is not None + 
) + + if prec_complexity > prod_complexity * 2: + issues.append("Precursors more complex than products") + + # Check for unreasonably many steps (precursors) + if len(precursors) > 10: + issues.append(f"Too many precursors ({len(precursors)})") + + score = max(0.0, 1.0 - 0.3 * len(issues)) + return { + "score": round(score, 4), + "precursor_complexity": round(prec_complexity, 2), + "product_complexity": round(prod_complexity, 2), + "issues": issues, + } + + @staticmethod + def _check_property_ranges(smiles: str, claimed_properties: dict) -> dict: + """Check if claimed properties are in plausible ranges.""" + if not RDKIT_AVAILABLE: + return {"score": 0.5, "note": "rdkit unavailable"} + + mol = Chem.MolFromSmiles(smiles) + if mol is None: + return {"score": 0.0, "note": "Invalid SMILES"} + + computed: dict[str, float] = {} + issues: list[str] = [] + + try: + computed["molecular_weight"] = Descriptors.MolWt(mol) + computed["logp"] = Descriptors.MolLogP(mol) + computed["hbd"] = rdMolDescriptors.CalcNumHBD(mol) + computed["hba"] = rdMolDescriptors.CalcNumHBA(mol) + computed["tpsa"] = Descriptors.TPSA(mol) + except Exception: + pass + + # Compare claimed vs computed where possible + comparisons: dict[str, dict] = {} + for prop, claimed_val in claimed_properties.items(): + if not isinstance(claimed_val, (int, float)): + continue + + prop_lower = prop.lower().replace(" ", "_") + computed_val = None + + for key in computed: + if key in prop_lower or prop_lower in key: + computed_val = computed[key] + break + + if computed_val is not None: + tolerance = max(abs(computed_val) * 0.1, 1.0) + match = abs(claimed_val - computed_val) <= tolerance + comparisons[prop] = { + "claimed": claimed_val, + "computed": round(computed_val, 4), + "match": match, + } + if not match: + issues.append(f"{prop}: claimed {claimed_val}, computed {computed_val:.4f}") + + if not comparisons: + return {"score": 0.5, "note": "No comparable properties", "computed": computed} + + matches = sum(1 for c in 
comparisons.values() if c["match"]) + score = matches / len(comparisons) if comparisons else 0.5 + + return { + "score": round(score, 4), + "comparisons": comparisons, + "computed_properties": {k: round(v, 4) for k, v in computed.items()}, + "warnings": issues[:5] if issues else [], + } + + # ------------------------------------------------------------------ + # API helpers + # ------------------------------------------------------------------ + + async def _check_pubchem(self, smiles: str, claimed_properties: dict) -> dict: + """Cross-reference with PubChem.""" + try: + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: + # Search by SMILES + encoded_smiles = httpx.URL(f"{PUBCHEM_API}/compound/smiles/{smiles}/property/MolecularWeight,XLogP,ExactMass,TPSA/JSON") + resp = await client.get(str(encoded_smiles)) + + if resp.status_code != 200: + return {"score": 0.3, "note": f"PubChem lookup failed (HTTP {resp.status_code})"} + + data = resp.json() + properties = data.get("PropertyTable", {}).get("Properties", [{}])[0] + + if not properties: + return {"score": 0.3, "note": "No PubChem data found"} + + # Compare claimed vs PubChem + comparisons: dict = {} + for prop, claimed_val in claimed_properties.items(): + if not isinstance(claimed_val, (int, float)): + continue + + for pc_key, pc_val in properties.items(): + if not isinstance(pc_val, (int, float)): + continue + if prop.lower().replace("_", "") in pc_key.lower().replace("_", ""): + tolerance = max(abs(pc_val) * 0.05, 0.5) + match = abs(claimed_val - pc_val) <= tolerance + comparisons[prop] = { + "claimed": claimed_val, + "pubchem": pc_val, + "match": match, + } + + if not comparisons: + return { + "score": 0.5, + "note": "Found in PubChem but no matching properties to compare", + "pubchem_properties": properties, + } + + matches = sum(1 for c in comparisons.values() if c["match"]) + score = matches / len(comparisons) + + return { + "score": round(score, 4), + "found": True, + "comparisons": comparisons, 
+ } + + except Exception as e: + logger.warning("pubchem_check_failed", error=str(e)) + return {"score": 0.0, "error": str(e)} + + async def _check_chembl(self, smiles: str, claimed_properties: dict) -> dict: + """Cross-reference with ChEMBL.""" + try: + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: + resp = await client.get( + f"{CHEMBL_API}/molecule/search", + params={"q": smiles, "format": "json", "limit": "1"}, + ) + + if resp.status_code != 200: + return {"score": 0.3, "note": f"ChEMBL lookup failed (HTTP {resp.status_code})"} + + data = resp.json() + molecules = data.get("molecules", []) + + if not molecules: + return {"score": 0.3, "note": "Not found in ChEMBL"} + + mol_data = molecules[0] + mol_props = mol_data.get("molecule_properties", {}) or {} + + return { + "score": 0.8, + "found": True, + "chembl_id": mol_data.get("molecule_chembl_id"), + "pref_name": mol_data.get("pref_name"), + "molecular_formula": mol_props.get("full_molformula"), + } + + except Exception as e: + logger.warning("chembl_check_failed", error=str(e)) + return {"score": 0.0, "error": str(e)} diff --git a/backend/verification/citation_verifier.py b/backend/verification/citation_verifier.py new file mode 100644 index 0000000..297fcbb --- /dev/null +++ b/backend/verification/citation_verifier.py @@ -0,0 +1,323 @@ +"""Cross-cutting verifier: Citation & Reference Verification. + +Validates citations in task results by checking DOI resolution, +metadata matching via OpenAlex + Semantic Scholar, claim-text +support via abstract similarity, and reference freshness. 
+""" +from __future__ import annotations + +import asyncio +import re +import time +from typing import Any + +import httpx + +from backend.logging_config import get_logger +from backend.verification.cross_cutting_base import ( + CrossCuttingResult, + CrossCuttingVerifier, +) + +logger = get_logger(__name__) + +CROSSREF_API = "https://api.crossref.org/works" +OPENALEX_API = "https://api.openalex.org/works" +SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1/paper" + +MAX_CITATIONS = 10 +HTTP_TIMEOUT = 15 + +# Fields in fast-moving domains get freshness penalties +FAST_MOVING_DOMAINS = {"ml_ai", "bioinformatics", "computational_biology"} +FRESHNESS_THRESHOLD_FAST = 5 # years +FRESHNESS_THRESHOLD_SLOW = 15 # years + + +class CitationVerifier(CrossCuttingVerifier): + name = "citation_reference" + default_weight = 0.15 + + def is_applicable(self, task_result: dict, task_metadata: dict) -> bool: + citation_keys = {"citations", "references", "papers", "bibliography"} + return any( + k in task_result and task_result[k] + for k in citation_keys + ) + + async def verify(self, task_result: dict, task_metadata: dict) -> CrossCuttingResult: + start = time.monotonic() + domain = task_metadata.get("domain", "general") + + citations = self._extract_citations(task_result) + if not citations: + return CrossCuttingResult( + verifier_name=self.name, + score=0.0, + weight=self.default_weight, + errors=["No parseable citations found"], + compute_time_seconds=time.monotonic() - start, + ) + + # Cap at MAX_CITATIONS + citations = citations[:MAX_CITATIONS] + + # Run all citation checks concurrently + results = await asyncio.gather( + *[self._check_citation(c, domain) for c in citations], + return_exceptions=True, + ) + + citation_details: list[dict] = [] + total_score = 0.0 + valid_count = 0 + + for i, r in enumerate(results): + if isinstance(r, Exception): + citation_details.append({ + "citation": citations[i].get("title", f"citation_{i}"), + "error": str(r), + "score": 0.0, 
+ }) + else: + citation_details.append(r) + total_score += r.get("score", 0.0) + valid_count += 1 + + avg_score = total_score / len(citations) if citations else 0.0 + elapsed = time.monotonic() - start + + return CrossCuttingResult( + verifier_name=self.name, + score=round(avg_score, 4), + weight=self.default_weight, + details={ + "citations_checked": len(citations), + "citations_valid": valid_count, + "citation_results": citation_details, + }, + compute_time_seconds=elapsed, + ) + + def _extract_citations(self, task_result: dict) -> list[dict]: + """Extract citation objects from various possible keys/formats.""" + for key in ("citations", "references", "papers", "bibliography"): + raw = task_result.get(key) + if not raw: + continue + if isinstance(raw, list): + return [self._normalize_citation(c) for c in raw if c] + return [] + + @staticmethod + def _normalize_citation(citation: Any) -> dict: + """Normalise a citation to {title, doi, authors, year, claim_text}.""" + if isinstance(citation, str): + return {"title": citation} + if isinstance(citation, dict): + return { + "title": citation.get("title", ""), + "doi": citation.get("doi", ""), + "authors": citation.get("authors", []), + "year": citation.get("year"), + "claim_text": citation.get("claim_text", citation.get("relevance", "")), + "url": citation.get("url", ""), + "abstract": citation.get("abstract", ""), + } + return {"title": str(citation)} + + async def _check_citation(self, citation: dict, domain: str) -> dict: + """Run all 4 component checks on a single citation.""" + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"title": citation.get("title", "")} + + doi = citation.get("doi", "") + if not doi: + doi = self._extract_doi_from_url(citation.get("url", "")) + + # Component 1: DOI resolution (0.30) + if doi: + doi_result = await self._resolve_doi(doi) + component_scores["doi_resolution"] = doi_result["score"] + details["doi"] = doi_result + else: + component_scores["doi_resolution"] = 
0.0 + details["doi"] = {"note": "No DOI provided"} + + # Component 2: Metadata match (0.30) + meta_result = await self._check_metadata_match(citation) + component_scores["metadata_match"] = meta_result["score"] + details["metadata"] = meta_result + + # Component 3: Claim support (0.25) + claim_score = self._check_claim_support(citation, meta_result.get("abstract", "")) + component_scores["claim_support"] = claim_score + details["claim_support_score"] = claim_score + + # Component 4: Freshness (0.15) + freshness_score = self._check_freshness(citation, domain) + component_scores["freshness"] = freshness_score + details["freshness_score"] = freshness_score + + weights = { + "doi_resolution": 0.30, + "metadata_match": 0.30, + "claim_support": 0.25, + "freshness": 0.15, + } + + score = sum(weights[k] * component_scores[k] for k in weights) + details["component_scores"] = component_scores + details["score"] = round(score, 4) + + return details + + @staticmethod + def _extract_doi_from_url(url: str) -> str: + """Try to extract a DOI from a URL.""" + match = re.search(r"10\.\d{4,}/[^\s]+", url) + return match.group(0).rstrip(".,;)") if match else "" + + async def _resolve_doi(self, doi: str) -> dict: + """Resolve DOI via CrossRef API.""" + try: + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: + resp = await client.get(f"{CROSSREF_API}/{doi}") + if resp.status_code == 200: + data = resp.json().get("message", {}) + return { + "score": 1.0, + "resolved": True, + "title": data.get("title", [""])[0] if data.get("title") else "", + "doi": doi, + } + return {"score": 0.0, "resolved": False, "status": resp.status_code} + except Exception as e: + logger.warning("doi_resolution_failed", doi=doi, error=str(e)) + return {"score": 0.0, "resolved": False, "error": str(e)} + + async def _check_metadata_match(self, citation: dict) -> dict: + """Check title/author/year match via OpenAlex and Semantic Scholar.""" + title = citation.get("title", "") + if not title: + return 
{"score": 0.0, "note": "No title to match"} + + # Try OpenAlex first + oa_result = await self._query_openalex(title) + if oa_result["score"] >= 0.7: + return oa_result + + # Fallback to Semantic Scholar + ss_result = await self._query_semantic_scholar(title) + return max([oa_result, ss_result], key=lambda r: r["score"]) + + async def _query_openalex(self, title: str) -> dict: + """Query OpenAlex for title match.""" + try: + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: + resp = await client.get( + OPENALEX_API, + params={"filter": f"title.search:{title[:200]}", "per_page": "1"}, + ) + if resp.status_code != 200: + return {"score": 0.0, "source": "openalex", "error": f"HTTP {resp.status_code}"} + + results = resp.json().get("results", []) + if not results: + return {"score": 0.0, "source": "openalex", "note": "No results"} + + top = results[0] + oa_title = top.get("title", "") + similarity = _jaccard_similarity(title.lower(), oa_title.lower()) + + return { + "score": round(min(1.0, similarity * 1.25), 4), + "source": "openalex", + "matched_title": oa_title, + "similarity": round(similarity, 4), + "abstract": top.get("abstract", "") or "", + "year": top.get("publication_year"), + } + except Exception as e: + logger.warning("openalex_query_failed", error=str(e)) + return {"score": 0.0, "source": "openalex", "error": str(e)} + + async def _query_semantic_scholar(self, title: str) -> dict: + """Query Semantic Scholar for title match.""" + try: + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: + resp = await client.get( + f"{SEMANTIC_SCHOLAR_API}/search", + params={"query": title[:200], "limit": "1", "fields": "title,abstract,year,authors"}, + ) + if resp.status_code != 200: + return {"score": 0.0, "source": "semantic_scholar", "error": f"HTTP {resp.status_code}"} + + data = resp.json().get("data", []) + if not data: + return {"score": 0.0, "source": "semantic_scholar", "note": "No results"} + + top = data[0] + ss_title = top.get("title", 
"") + similarity = _jaccard_similarity(title.lower(), ss_title.lower()) + + return { + "score": round(min(1.0, similarity * 1.25), 4), + "source": "semantic_scholar", + "matched_title": ss_title, + "similarity": round(similarity, 4), + "abstract": top.get("abstract", "") or "", + "year": top.get("year"), + } + except Exception as e: + logger.warning("semantic_scholar_query_failed", error=str(e)) + return {"score": 0.0, "source": "semantic_scholar", "error": str(e)} + + @staticmethod + def _check_claim_support(citation: dict, fetched_abstract: str) -> float: + """Check if claim text is supported by paper abstract.""" + claim_text = citation.get("claim_text", "") + abstract = fetched_abstract or citation.get("abstract", "") + + if not claim_text or not abstract: + return 0.5 # neutral — can't check + + similarity = _jaccard_similarity(claim_text.lower(), abstract.lower()) + return round(min(1.0, similarity * 2.0), 4) + + @staticmethod + def _check_freshness(citation: dict, domain: str) -> float: + """Penalize old references in fast-moving fields.""" + year = citation.get("year") + if not year or not isinstance(year, (int, float)): + return 0.5 # neutral + + import datetime + current_year = datetime.datetime.now(datetime.timezone.utc).year + age = current_year - int(year) + + if age < 0: + return 0.8 # future year — slight penalty for plausibility + + threshold = ( + FRESHNESS_THRESHOLD_FAST + if domain in FAST_MOVING_DOMAINS + else FRESHNESS_THRESHOLD_SLOW + ) + + if age <= threshold: + return 1.0 + elif age <= threshold * 2: + return round(max(0.3, 1.0 - (age - threshold) / threshold), 4) + return 0.3 + + +def _jaccard_similarity(a: str, b: str) -> float: + """Word-level Jaccard similarity between two strings.""" + words_a = set(a.split()) + words_b = set(b.split()) + if not words_a or not words_b: + return 0.0 + intersection = words_a & words_b + union = words_a | words_b + return len(intersection) / len(union) if union else 0.0 diff --git 
#!/usr/bin/env bash
# ===========================================
# Build ClawdLab verification Docker images
# ===========================================
# Usage:
#   ./build.sh all                 # Build all images (default with no args)
#   ./build.sh lean4               # Build Lean 4 + Mathlib image only
#   ./build.sh coq isabelle        # Build several specific images
# Targets: lean4 compbio coq isabelle reproducibility ml-inference all

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# build_image <name> <dockerfile> — shared docker-build wrapper so each
# target doesn't repeat the same four-line invocation.
build_image() {
    local name="$1" dockerfile="$2"
    echo "==> Building clawdlab/${name} ..."
    docker build \
        -f "$SCRIPT_DIR/${dockerfile}" \
        -t "clawdlab/${name}:latest" \
        "$SCRIPT_DIR"
    echo "==> clawdlab/${name}:latest built successfully"
}

build_lean4()           { build_image lean4-mathlib   lean4-mathlib.Dockerfile; }
build_compbio()         { build_image compbio-cpu     compbio.Dockerfile; }
build_coq()             { build_image coq             coq.Dockerfile; }
build_isabelle()        { build_image isabelle        isabelle.Dockerfile; }
build_reproducibility() { build_image reproducibility reproducibility.Dockerfile; }
build_ml_inference()    { build_image ml-inference    ml-inference.Dockerfile; }

# Dispatch one target name to its build function.
build_target() {
    case "$1" in
        lean4)
            build_lean4
            ;;
        compbio)
            build_compbio
            ;;
        coq)
            build_coq
            ;;
        isabelle)
            build_isabelle
            ;;
        reproducibility)
            build_reproducibility
            ;;
        ml-inference)
            build_ml_inference
            ;;
        all)
            # compbio first: it is the fastest build, so failures surface early.
            build_compbio
            build_lean4
            build_coq
            build_isabelle
            build_reproducibility
            build_ml_inference
            ;;
        *)
            echo "Usage: $0 [lean4|compbio|coq|isabelle|reproducibility|ml-inference|all]"
            exit 1
            ;;
    esac
}

# Generalization: accept any number of targets per invocation.
# No arguments preserves the original default of building everything.
if [ "$#" -eq 0 ]; then
    set -- all
fi
for target in "$@"; do
    build_target "$target"
done

echo "Done."
# --- compbio.Dockerfile (body; header comment precedes this span) ---
FROM python:3.11-slim

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# dssp: secondary-structure assignment tool used by the compbio adapter
RUN apt-get update && apt-get install -y --no-install-recommends \
    dssp \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir \
    biopython \
    numpy \
    scipy

# Create non-root user
RUN groupadd --gid 1001 verifier \
    && useradd --uid 1001 --gid verifier --shell /bin/bash --create-home verifier

WORKDIR /workspace
RUN chown verifier:verifier /workspace

USER verifier

# Default: run a Python script passed as argument
# The adapter writes validation code to /workspace/validate.py then runs:
#   docker run --rm -v /tmp/compbio:/workspace clawdlab/compbio-cpu python /workspace/validate.py
CMD ["python", "--version"]

# --- coq.Dockerfile ---
FROM coqorg/coq:8.18

# Refresh the opam package index before installing: the index baked into the
# base image goes stale, and installs against it can fail to resolve.
# Install MathComp.
RUN opam update -y && opam install -y coq-mathcomp-ssreflect coq-mathcomp-algebra

# BUG FIX: coqorg/coq images run as the non-root "coq" user by default, so a
# bare `RUN useradd` fails with a permission error.  Switch to root for the
# account creation, then drop back (same pattern as isabelle.Dockerfile).
USER root
RUN useradd -m -s /bin/bash verifier
USER verifier
WORKDIR /workspace

# --- isabelle.Dockerfile ---
FROM makarius/isabelle:Isabelle2024

# Pre-build HOL session for faster proofs.  `|| true` is deliberate: a failed
# pre-build only costs speed at verification time, not correctness.
RUN isabelle build -b HOL || true

# Create non-root verifier user
USER root
RUN useradd -m -s /bin/bash verifier && \
    chown -R verifier:verifier /home/verifier
USER verifier
WORKDIR /workspace

# --- lean4-mathlib.Dockerfile ---
# ===========================================
# ClawdLab — Lean 4 + Mathlib Verification Image
# ===========================================
# Multi-stage build: builder compiles the toolchain + Mathlib cache,
# runtime is a minimal Ubuntu image (~2 GB final).
#
# Build time: ~30 minutes (Mathlib compilation)
# Usage: docker build -f lean4-mathlib.Dockerfile -t clawdlab/lean4-mathlib .

# === BUILDER ===
FROM ubuntu:22.04 AS builder

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl git build-essential ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install elan (Lean version manager) + Lean 4 v4.3.0
RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | \
    sh -s -- -y --default-toolchain leanprover/lean4:v4.3.0

ENV PATH="/root/.elan/bin:$PATH"

# Clone and build Mathlib4 (this takes ~20-30 minutes)
RUN git clone --depth 1 https://github.com/leanprover-community/mathlib4.git /opt/mathlib4 && \
    cd /opt/mathlib4 && \
    lake build

# === RUNTIME ===
FROM ubuntu:22.04 AS runtime

ENV DEBIAN_FRONTEND=noninteractive

# libgmp10: runtime dependency of the Lean binaries
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates libgmp10 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN groupadd --gid 1001 verifier \
    && useradd --uid 1001 --gid verifier --shell /bin/bash --create-home verifier

# Copy Lean toolchain and Mathlib from builder
COPY --from=builder /root/.elan /home/verifier/.elan
COPY --from=builder /opt/mathlib4 /opt/mathlib4

# Fix ownership (the builder stage ran everything as root)
RUN chown -R verifier:verifier /home/verifier/.elan /opt/mathlib4

ENV PATH="/home/verifier/.elan/bin:$PATH" \
    MATHLIB_PATH="/opt/mathlib4" \
    ELAN_HOME="/home/verifier/.elan"

WORKDIR /workspace
RUN chown verifier:verifier /workspace

USER verifier

# Default: check a .lean file passed as argument
# The adapter writes proof_code to /workspace/Proof.lean then runs:
#   docker run --rm -v /tmp/proof:/workspace clawdlab/lean4-mathlib lean /workspace/Proof.lean
CMD ["lean", "--version"]

# --- ml-inference.Dockerfile ---
FROM python:3.11-slim

# CPU-only torch from the dedicated index, then the HF inference stack.
RUN pip install --no-cache-dir \
    torch --index-url https://download.pytorch.org/whl/cpu && \
    pip install --no-cache-dir \
    transformers datasets accelerate sentencepiece protobuf

# Create non-root verifier user
RUN useradd -m -s /bin/bash verifier
USER verifier
WORKDIR /workspace

# --- reproducibility.Dockerfile ---
FROM python:3.11-slim

# git + toolchain: enough to clone and build typical research repos.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git make gcc g++ && \
    rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir \
    numpy scipy pandas scikit-learn matplotlib seaborn \
    jupyter pyyaml toml

# Create non-root verifier user
RUN useradd -m -s /bin/bash verifier
USER verifier
WORKDIR /workspace
"""Base class for cross-cutting verifiers that apply to any domain."""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class CrossCuttingResult:
    """Outcome reported by one cross-cutting verifier run."""
    verifier_name: str          # which verifier produced this result
    score: float                # normalised quality score, 0.0-1.0
    weight: float               # share this verifier contributes to the merged score
    details: dict[str, Any] = field(default_factory=dict)   # verifier-specific payload
    errors: list[str] = field(default_factory=list)         # hard failures observed
    warnings: list[str] = field(default_factory=list)       # soft issues observed
    compute_time_seconds: float = 0.0                       # wall-clock cost of the run


class CrossCuttingVerifier:
    """Contract for verifiers that augment every domain adapter.

    Domain adapters are bound to a single domain; a cross-cutting verifier
    instead opts in per job, running on any task result that carries the
    data it knows how to check (citations, statistics, repos, raw data, ...).
    Subclasses override both methods below.
    """
    name: str = ""                  # unique registry identifier
    default_weight: float = 0.10    # default contribution to the merged score
    requires_docker: bool = False   # set True if verify() shells out to Docker

    def is_applicable(self, task_result: dict, task_metadata: dict) -> bool:
        """Decide whether this verifier should run for the given job."""
        raise NotImplementedError

    async def verify(self, task_result: dict, task_metadata: dict) -> CrossCuttingResult:
        """Perform the check and report a :class:`CrossCuttingResult`."""
        raise NotImplementedError
"""Orchestrator for cross-cutting verifiers.

Collects all registered CrossCuttingVerifier instances, filters to
applicable ones, runs them concurrently, and merges results with
the domain adapter result using weighted scoring.
"""
from __future__ import annotations

import asyncio
import time
from typing import Any

from backend.logging_config import get_logger
from backend.verification.base import VerificationResult
from backend.verification.cross_cutting_base import (
    CrossCuttingResult,
    CrossCuttingVerifier,
)

logger = get_logger(__name__)

# Registry of all cross-cutting verifiers.  Populated once at import time
# by _register_all_cross_cutting() at the bottom of this module.
_CC_VERIFIERS: list[CrossCuttingVerifier] = []


def register_cross_cutting(verifier: CrossCuttingVerifier) -> None:
    """Register a cross-cutting verifier.

    NOTE(review): no dedup — registering the same verifier twice would run
    it twice and double its weight share; callers must register each once.
    """
    _CC_VERIFIERS.append(verifier)
    logger.info("cross_cutting_registered", name=verifier.name)


def get_cross_cutting_verifiers() -> list[CrossCuttingVerifier]:
    """Return a copy of the registry (mutating it does not affect dispatch)."""
    return list(_CC_VERIFIERS)


async def _run_single(
    verifier: CrossCuttingVerifier,
    task_result: dict,
    task_metadata: dict,
) -> CrossCuttingResult:
    """Run a single cross-cutting verifier with error handling.

    A crashing verifier is converted into a zero-score result (keeping its
    default weight) so one failure cannot abort the whole gather below.
    """
    start = time.monotonic()
    try:
        result = await verifier.verify(task_result, task_metadata)
        # Overwrite whatever the verifier reported with the measured time.
        result.compute_time_seconds = time.monotonic() - start
        return result
    except Exception as e:
        elapsed = time.monotonic() - start
        logger.exception("cross_cutting_verifier_failed", name=verifier.name)
        return CrossCuttingResult(
            verifier_name=verifier.name,
            score=0.0,
            weight=verifier.default_weight,
            errors=[f"Verifier crashed: {str(e)}"],
            compute_time_seconds=elapsed,
        )


async def run_cross_cutting(
    task_result: dict,
    task_metadata: dict,
) -> list[CrossCuttingResult]:
    """Run all applicable cross-cutting verifiers concurrently.

    Applicability checks are wrapped by _safe_is_applicable so a buggy
    is_applicable() excludes its verifier instead of raising.
    """
    applicable = [
        v for v in _CC_VERIFIERS
        if _safe_is_applicable(v, task_result, task_metadata)
    ]

    if not applicable:
        return []

    logger.info(
        "cross_cutting_running",
        count=len(applicable),
        names=[v.name for v in applicable],
    )

    # _run_single never raises, so no return_exceptions needed here.
    results = await asyncio.gather(
        *[_run_single(v, task_result, task_metadata) for v in applicable]
    )

    return list(results)


def _safe_is_applicable(
    verifier: CrossCuttingVerifier,
    task_result: dict,
    task_metadata: dict,
) -> bool:
    """Check applicability without crashing; errors are logged and treated as 'no'."""
    try:
        return verifier.is_applicable(task_result, task_metadata)
    except Exception:
        logger.exception("cross_cutting_applicability_error", name=verifier.name)
        return False


def merge_results(
    domain_result: VerificationResult,
    cc_results: list[CrossCuttingResult],
    domain_weight: float = 0.70,
) -> VerificationResult:
    """Merge domain adapter result with cross-cutting verifier results.

    Domain adapter gets ``domain_weight`` (default 70%) of the final score.
    Cross-cutting verifiers share the remaining ``1 - domain_weight`` (30%),
    distributed proportionally to their individual weights.
    """
    # With nothing to merge (or degenerate weights), the domain result
    # passes through untouched.
    if not cc_results:
        return domain_result

    # Normalise cross-cutting weights so they sum to 1.0
    total_cc_weight = sum(r.weight for r in cc_results)
    if total_cc_weight <= 0:
        return domain_result

    cc_weight_share = 1.0 - domain_weight

    cc_score = sum(
        (r.weight / total_cc_weight) * r.score for r in cc_results
    )

    final_score = domain_weight * domain_result.score + cc_weight_share * cc_score
    final_score = min(1.0, round(final_score, 4))

    # Merge warnings, errors, and details
    all_warnings = list(domain_result.warnings)
    all_errors = list(domain_result.errors)
    cc_details: list[dict[str, Any]] = []

    for r in cc_results:
        all_warnings.extend(r.warnings)
        all_errors.extend(r.errors)
        cc_details.append({
            "verifier": r.verifier_name,
            "score": r.score,
            "weight": r.weight,
            "details": r.details,
            "errors": r.errors,
            "warnings": r.warnings,
            "compute_time_seconds": r.compute_time_seconds,
        })

    merged_details = dict(domain_result.details)
    merged_details["cross_cutting"] = cc_details
    # Keep the full score breakdown so reviewers can audit the merge.
    merged_details["scoring"] = {
        "domain_score": domain_result.score,
        "domain_weight": domain_weight,
        "cc_aggregate_score": round(cc_score, 4),
        "cc_weight_share": cc_weight_share,
        "final_score": final_score,
    }

    # NOTE(review): passed is re-derived from the merged score (>= 0.5), so
    # cross-cutting results can flip a domain pass/fail — confirm intended.
    return VerificationResult(
        passed=final_score >= 0.5,
        score=final_score,
        badge=VerificationResult.score_to_badge(final_score),
        domain=domain_result.domain,
        details=merged_details,
        errors=all_errors,
        warnings=all_warnings,
        compute_time_seconds=domain_result.compute_time_seconds + sum(
            r.compute_time_seconds for r in cc_results
        ),
    )


def _register_all_cross_cutting() -> None:
    """Import and register all cross-cutting verifiers. Called once at startup.

    Imports are local to avoid circular imports at module load.
    """
    from backend.verification.citation_verifier import CitationVerifier
    from backend.verification.statistical_forensics import StatisticalForensicsVerifier
    from backend.verification.reproducibility_executor import ReproducibilityExecutor
    from backend.verification.data_integrity import DataIntegrityVerifier

    for cls in [CitationVerifier, StatisticalForensicsVerifier, ReproducibilityExecutor, DataIntegrityVerifier]:
        register_cross_cutting(cls())


# Import-time side effect: populates the registry as soon as this module loads.
_register_all_cross_cutting()
+""" +from __future__ import annotations + +import asyncio +import hashlib +import math +import time +from typing import Any + +from backend.logging_config import get_logger +from backend.verification.cross_cutting_base import ( + CrossCuttingResult, + CrossCuttingVerifier, +) + +logger = get_logger(__name__) + + +class DataIntegrityVerifier(CrossCuttingVerifier): + name = "data_integrity" + default_weight = 0.10 + + def is_applicable(self, task_result: dict, task_metadata: dict) -> bool: + data_keys = {"data", "dataset", "raw_data", "results_summary", "output_checksums"} + return any(k in task_result and task_result[k] for k in data_keys) + + async def verify(self, task_result: dict, task_metadata: dict) -> CrossCuttingResult: + start = time.monotonic() + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {} + warnings: list[str] = [] + + data = self._extract_data(task_result) + checksums = task_result.get("output_checksums", {}) + schema_def = task_result.get("schema") or task_result.get("expected_schema") + + # Run all checks concurrently via threads + schema_task = asyncio.to_thread(self._check_schema, data, schema_def) if data else _neutral("No data for schema check") + dup_task = asyncio.to_thread(self._check_duplicates, data) if data else _neutral("No data for duplicate check") + outlier_task = asyncio.to_thread(self._check_outliers, data) if data else _neutral("No data for outlier check") + hash_task = asyncio.to_thread(self._check_hashes, task_result, checksums) if checksums else _neutral("No checksums") + + schema_result, dup_result, outlier_result, hash_result = await asyncio.gather( + schema_task, dup_task, outlier_task, hash_task, + ) + + for name, result in [("schema_valid", schema_result), ("no_duplicates", dup_result), + ("no_outliers", outlier_result), ("hash_match", hash_result)]: + component_scores[name] = result.get("score", 0.5) + details[name] = result + if result.get("warnings"): + warnings.extend(result["warnings"]) + + # 
Equal weights for all 4 components + applicable = [k for k in component_scores if details[k].get("applicable", True)] + if applicable: + score = sum(component_scores[k] for k in applicable) / len(applicable) + else: + score = 0.5 + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return CrossCuttingResult( + verifier_name=self.name, + score=round(score, 4), + weight=self.default_weight, + details=details, + warnings=warnings, + compute_time_seconds=elapsed, + ) + + @staticmethod + def _extract_data(task_result: dict) -> list[dict] | None: + """Extract tabular data from task result.""" + for key in ("data", "dataset", "raw_data"): + raw = task_result.get(key) + if isinstance(raw, list) and raw and isinstance(raw[0], dict): + return raw + if isinstance(raw, dict): + # Single-row or nested data — try to extract rows + if "rows" in raw and isinstance(raw["rows"], list): + return raw["rows"] + if "records" in raw and isinstance(raw["records"], list): + return raw["records"] + + # results_summary may contain numeric data + summary = task_result.get("results_summary") + if isinstance(summary, dict): + # Convert summary to single-row dataset for outlier check + numeric_vals = {k: v for k, v in summary.items() if isinstance(v, (int, float))} + if numeric_vals: + return [numeric_vals] + + return None + + @staticmethod + def _check_schema(data: list[dict], schema_def: dict | None) -> dict: + """Validate data structure against declared schema.""" + if not data: + return {"score": 0.5, "applicable": False, "note": "No data"} + + # If explicit schema is provided, validate against it + if schema_def and isinstance(schema_def, dict): + expected_fields = set(schema_def.get("fields", schema_def.get("columns", []))) + if expected_fields: + actual_fields = set(data[0].keys()) if data else set() + missing = expected_fields - actual_fields + extra = actual_fields - expected_fields + coverage = len(expected_fields - missing) / len(expected_fields) 
if expected_fields else 0 + return { + "score": round(coverage, 4), + "applicable": True, + "expected_fields": sorted(expected_fields), + "missing_fields": sorted(missing), + "extra_fields": sorted(extra), + } + + # Basic structural consistency check: all rows have same keys + if len(data) < 2: + return {"score": 1.0, "applicable": True, "note": "Single row, schema consistent"} + + ref_keys = set(data[0].keys()) + inconsistent = 0 + for i, row in enumerate(data[1:], 1): + if set(row.keys()) != ref_keys: + inconsistent += 1 + if inconsistent >= 5: + break + + score = 1.0 - (inconsistent / min(len(data) - 1, 100)) + return { + "score": round(max(0.0, score), 4), + "applicable": True, + "total_rows": len(data), + "inconsistent_rows": inconsistent, + "columns": sorted(ref_keys), + } + + @staticmethod + def _check_duplicates(data: list[dict]) -> dict: + """Detect exact and near-duplicate rows.""" + if not data or len(data) < 2: + return {"score": 1.0, "applicable": True, "duplicates": 0} + + seen: set[str] = set() + exact_dupes = 0 + + for row in data: + key = str(sorted(row.items())) + if key in seen: + exact_dupes += 1 + else: + seen.add(key) + + dup_ratio = exact_dupes / len(data) if data else 0 + + if dup_ratio > 0.5: + score = 0.1 + elif dup_ratio > 0.2: + score = 0.4 + elif dup_ratio > 0.05: + score = 0.7 + else: + score = 1.0 + + return { + "score": round(score, 4), + "applicable": True, + "total_rows": len(data), + "exact_duplicates": exact_dupes, + "duplicate_ratio": round(dup_ratio, 4), + "warnings": [f"{exact_dupes} exact duplicate rows detected"] if exact_dupes > 0 else [], + } + + @staticmethod + def _check_outliers(data: list[dict]) -> dict: + """Detect anomalous outliers via z-score (>3 sigma).""" + if not data: + return {"score": 0.5, "applicable": False, "note": "No data"} + + # Collect numeric columns + numeric_cols: dict[str, list[float]] = {} + for row in data: + for k, v in row.items(): + if isinstance(v, (int, float)) and not isinstance(v, bool) 
and math.isfinite(v): + numeric_cols.setdefault(k, []).append(float(v)) + + if not numeric_cols: + return {"score": 0.5, "applicable": False, "note": "No numeric columns"} + + outlier_counts: dict[str, int] = {} + total_values = 0 + total_outliers = 0 + + for col, values in numeric_cols.items(): + if len(values) < 5: + continue + + total_values += len(values) + mean = sum(values) / len(values) + variance = sum((x - mean) ** 2 for x in values) / len(values) + std = math.sqrt(variance) if variance > 0 else 0 + + if std == 0: + continue + + n_outliers = sum(1 for x in values if abs((x - mean) / std) > 3.0) + if n_outliers > 0: + outlier_counts[col] = n_outliers + total_outliers += n_outliers + + if total_values == 0: + return {"score": 0.5, "applicable": False, "note": "Insufficient numeric data"} + + outlier_ratio = total_outliers / total_values + # Expect ~0.3% outliers under normal distribution + if outlier_ratio > 0.10: + score = 0.2 + elif outlier_ratio > 0.05: + score = 0.5 + elif outlier_ratio > 0.01: + score = 0.8 + else: + score = 1.0 + + return { + "score": round(score, 4), + "applicable": True, + "columns_checked": len(numeric_cols), + "total_values": total_values, + "total_outliers": total_outliers, + "outlier_ratio": round(outlier_ratio, 6), + "outlier_columns": outlier_counts, + "warnings": [f"High outlier ratio ({outlier_ratio:.1%}) in columns: {list(outlier_counts.keys())}"] + if outlier_ratio > 0.05 else [], + } + + @staticmethod + def _check_hashes(task_result: dict, checksums: dict) -> dict: + """Verify SHA-256 hashes of data blobs.""" + if not checksums: + return {"score": 0.5, "applicable": False, "note": "No checksums"} + + matches = 0 + mismatches = 0 + checks: list[dict] = [] + + for key, expected_hash in checksums.items(): + data_blob = task_result.get(key) + if data_blob is None: + # Try nested data + for container_key in ("data", "raw_data", "dataset"): + container = task_result.get(container_key) + if isinstance(container, dict) and key in 
container: + data_blob = container[key] + break + + if data_blob is None: + checks.append({"key": key, "match": False, "note": "Data not found"}) + mismatches += 1 + continue + + if isinstance(data_blob, (dict, list)): + serialised = _canonical_json(data_blob) + else: + serialised = str(data_blob) + + actual_hash = hashlib.sha256(serialised.encode()).hexdigest() + match = actual_hash == expected_hash + checks.append({ + "key": key, + "match": match, + "expected": expected_hash[:16] + "...", + "actual": actual_hash[:16] + "...", + }) + + if match: + matches += 1 + else: + mismatches += 1 + + total = matches + mismatches + score = matches / total if total > 0 else 0.5 + + return { + "score": round(score, 4), + "applicable": True, + "matches": matches, + "mismatches": mismatches, + "checks": checks, + "warnings": [f"{mismatches} hash mismatch(es)"] if mismatches > 0 else [], + } + + +def _canonical_json(obj: Any) -> str: + """Produce a canonical JSON string for hashing.""" + import json + return json.dumps(obj, sort_keys=True, separators=(",", ":"), default=str) + + +async def _neutral(note: str) -> dict: + return {"score": 0.5, "applicable": False, "note": note} diff --git a/backend/verification/dispatcher.py b/backend/verification/dispatcher.py index 51d6202..7c7086f 100644 --- a/backend/verification/dispatcher.py +++ b/backend/verification/dispatcher.py @@ -6,6 +6,15 @@ logger = get_logger(__name__) +# Domains that require Docker containers for verification +DOCKER_DOMAINS: set[str] = {"mathematics", "computational_biology"} + + +def is_docker_domain(domain: str) -> bool: + """Return True if verification for this domain runs in a Docker container.""" + return domain in DOCKER_DOMAINS + + # Registry — populated at import time _ADAPTERS: dict[str, VerificationAdapter] = {} @@ -51,8 +60,11 @@ def _register_all() -> None: from backend.verification.compbio_adapter import CompBioAdapter from backend.verification.materials_adapter import MaterialsAdapter from 
backend.verification.bioinfo_adapter import BioInfoAdapter + from backend.verification.chemistry_adapter import ChemistryAdapter + from backend.verification.physics_adapter import PhysicsAdapter - for cls in [Lean4Adapter, MLReproAdapter, CompBioAdapter, MaterialsAdapter, BioInfoAdapter]: + for cls in [Lean4Adapter, MLReproAdapter, CompBioAdapter, MaterialsAdapter, + BioInfoAdapter, ChemistryAdapter, PhysicsAdapter]: register_adapter(cls()) diff --git a/backend/verification/lean4_adapter.py b/backend/verification/lean4_adapter.py index 6d24660..3d9c71b 100644 --- a/backend/verification/lean4_adapter.py +++ b/backend/verification/lean4_adapter.py @@ -15,6 +15,12 @@ LEAN4_IMAGE = "clawdlab/lean4-mathlib:latest" LEAN4_TIMEOUT = 300 # 5 min max +COQ_IMAGE = "clawdlab/coq:latest" +COQ_TIMEOUT = 300 + +ISABELLE_IMAGE = "clawdlab/isabelle:latest" +ISABELLE_TIMEOUT = 300 + class Lean4Adapter(VerificationAdapter): domain = "mathematics" @@ -25,9 +31,17 @@ async def verify(self, task_result: dict, task_metadata: dict) -> VerificationRe return VerificationResult.fail(self.domain, ["No proof_code in result"]) claim_type = task_result.get("claim_type", "theorem") + proof_system = task_result.get("proof_system", "lean4") dependencies = task_result.get("dependencies", []) statement = task_result.get("statement") + # Route to proof system + if proof_system == "coq": + return await self._verify_coq(task_result) + elif proof_system == "isabelle": + return await self._verify_isabelle(task_result) + + # Default: Lean 4 if claim_type == "theorem": return await self._verify_theorem(proof_code, dependencies, statement) elif claim_type == "conjecture": @@ -168,3 +182,155 @@ def _parse_lean_metrics(self, stdout: str, stderr: str, code: str) -> dict: "tactics_used": tactics_used, "tactic_count": sum(tactics_used.values()), } + + # ------------------------------------------------------------------ + # Coq verification + # ------------------------------------------------------------------ + + 
async def _verify_coq(self, task_result: dict) -> VerificationResult: + """Verify proof using Coq in Docker sandbox.""" + start = time.monotonic() + + proof_code = task_result.get("proof_code", "") + statement = task_result.get("statement") + dependencies = task_result.get("dependencies", []) + + # Build .v file + imports = "\n".join(f"Require Import {dep}." for dep in dependencies) if dependencies else "" + full_code = f"{imports}\n\n{proof_code}" if imports else proof_code + + with tempfile.TemporaryDirectory() as tmpdir: + proof_path = Path(tmpdir) / "Proof.v" + proof_path.write_text(full_code) + + cmd = [ + "docker", "run", "--rm", + "--network=none", + "--memory=4g", + "--cpus=2", + "-v", f"{tmpdir}:/workspace:ro", + "-w", "/workspace", + COQ_IMAGE, + "coqc", "Proof.v", + ] + + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=COQ_TIMEOUT, + ) + except asyncio.TimeoutError: + return VerificationResult.fail( + self.domain, ["Coq compilation timed out (5 min limit)"], + ) + + elapsed = time.monotonic() - start + stderr_text = stderr.decode(errors="replace") + + if proc.returncode == 0: + return VerificationResult( + passed=True, + score=1.0, + badge=VerificationBadge.GREEN, + domain=self.domain, + details={ + "compiler": "coq", + "compile_time_seconds": round(elapsed, 2), + "statement": statement, + }, + compute_time_seconds=elapsed, + ) + else: + errors = [line for line in stderr_text.splitlines() if "Error" in line][:10] + return VerificationResult( + passed=False, + score=0.0, + badge=VerificationBadge.RED, + domain=self.domain, + errors=errors or [stderr_text[:500]], + details={"compiler": "coq", "compiler_output": stderr_text[:2000]}, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # Isabelle verification + # 
------------------------------------------------------------------ + + async def _verify_isabelle(self, task_result: dict) -> VerificationResult: + """Verify proof using Isabelle/HOL in Docker sandbox.""" + start = time.monotonic() + + proof_code = task_result.get("proof_code", "") + statement = task_result.get("statement") + theory_name = task_result.get("theory_name", "Proof") + + # Build .thy file + full_code = f'theory {theory_name}\nimports Main\nbegin\n\n{proof_code}\n\nend' + + with tempfile.TemporaryDirectory() as tmpdir: + thy_path = Path(tmpdir) / f"{theory_name}.thy" + thy_path.write_text(full_code) + + # Write ROOT file for isabelle build + root_path = Path(tmpdir) / "ROOT" + root_path.write_text(f'session "{theory_name}" = HOL +\n theories {theory_name}\n') + + cmd = [ + "docker", "run", "--rm", + "--network=none", + "--memory=4g", + "--cpus=2", + "-v", f"{tmpdir}:/workspace:ro", + "-w", "/workspace", + ISABELLE_IMAGE, + "isabelle", "build", "-D", ".", + ] + + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=ISABELLE_TIMEOUT, + ) + except asyncio.TimeoutError: + return VerificationResult.fail( + self.domain, ["Isabelle build timed out (5 min limit)"], + ) + + elapsed = time.monotonic() - start + stdout_text = stdout.decode(errors="replace") + stderr_text = stderr.decode(errors="replace") + + if proc.returncode == 0: + return VerificationResult( + passed=True, + score=1.0, + badge=VerificationBadge.GREEN, + domain=self.domain, + details={ + "compiler": "isabelle", + "compile_time_seconds": round(elapsed, 2), + "statement": statement, + "theory_name": theory_name, + }, + compute_time_seconds=elapsed, + ) + else: + combined = f"{stdout_text}\n{stderr_text}" + errors = [line for line in combined.splitlines() if "Error" in line or "***" in line][:10] + return VerificationResult( + passed=False, + 
score=0.0, + badge=VerificationBadge.RED, + domain=self.domain, + errors=errors or [combined[:500]], + details={"compiler": "isabelle", "compiler_output": combined[:2000]}, + compute_time_seconds=elapsed, + ) diff --git a/backend/verification/ml_repro_adapter.py b/backend/verification/ml_repro_adapter.py index fdf790d..c09d3de 100644 --- a/backend/verification/ml_repro_adapter.py +++ b/backend/verification/ml_repro_adapter.py @@ -22,6 +22,8 @@ "/resolve/main/default/train-00000-of-00001.parquet" ) TIMEOUT = 30 +ML_INFERENCE_IMAGE = "clawdlab/ml-inference:latest" +ML_INFERENCE_TIMEOUT = 600 # 10 min SUPPORTED_BENCHMARKS = { "mmlu", "hellaswag", "arc_easy", "arc_challenge", "winogrande", @@ -41,6 +43,8 @@ async def verify(self, task_result: dict, task_metadata: dict) -> VerificationRe if claim_type == "benchmark_result": return await self._verify_benchmark(task_result) + elif claim_type == "benchmark_live": + return await self._verify_benchmark_live(task_result) elif claim_type == "ml_experiment": return await self._verify_experiment(task_result) elif claim_type == "architecture": @@ -48,6 +52,10 @@ async def verify(self, task_result: dict, task_metadata: dict) -> VerificationRe else: return VerificationResult.fail(self.domain, [f"Unknown claim_type: {claim_type}"]) + def requires_docker_for(self, task_result: dict) -> bool: + """Return True if this specific claim type requires Docker.""" + return task_result.get("claim_type") == "benchmark_live" + # ------------------------------------------------------------------ # benchmark_result # ------------------------------------------------------------------ @@ -142,6 +150,250 @@ async def _verify_benchmark(self, result: dict) -> VerificationResult: compute_time_seconds=elapsed, ) + # ------------------------------------------------------------------ + # benchmark_live — Docker-based live inference + # ------------------------------------------------------------------ + + async def _verify_benchmark_live(self, result: 
dict) -> VerificationResult: + """Run live inference in Docker sandbox and compare to claimed metrics.""" + start = time.monotonic() + + model_id = result.get("model_id") + benchmark = result.get("benchmark", "").lower() + claimed_metrics = result.get("metrics", {}) + sample_size = min(result.get("sample_size", 20), 50) + + if not model_id: + return VerificationResult.fail(self.domain, ["No model_id provided"]) + if not benchmark: + return VerificationResult.fail(self.domain, ["No benchmark specified for live inference"]) + + component_scores: dict[str, float] = {} + details: dict = { + "claim_type": "benchmark_live", + "model_id": model_id, + "benchmark": benchmark, + "sample_size": sample_size, + } + warnings: list[str] = [] + + # Build inference script + script = self._build_inference_script(model_id, benchmark, sample_size) + + import tempfile + from pathlib import Path + + with tempfile.TemporaryDirectory() as tmpdir: + script_path = Path(tmpdir) / "run_inference.py" + script_path.write_text(script) + + cmd = [ + "docker", "run", "--rm", + "--network=host", # Needs network to download model from HF + "--memory=4g", + "--cpus=2", + "-v", f"{tmpdir}:/workspace", + "-w", "/workspace", + ML_INFERENCE_IMAGE, + "python", "run_inference.py", + ] + + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=ML_INFERENCE_TIMEOUT, + ) + except asyncio.TimeoutError: + return VerificationResult.fail( + self.domain, + [f"Live inference timed out ({ML_INFERENCE_TIMEOUT}s)"], + ) + except FileNotFoundError: + return VerificationResult.fail( + self.domain, ["Docker not available for live inference"], + ) + + stdout_text = stdout.decode(errors="replace") + stderr_text = stderr.decode(errors="replace") + + # Parse JSON output from script + import json + try: + inference_results = json.loads(stdout_text) + except 
(json.JSONDecodeError, ValueError): + # Component 1: Model loadable — failed + component_scores["model_loadable"] = 0.0 + details["error"] = stderr_text[:1000] + elapsed = time.monotonic() - start + return VerificationResult( + passed=False, + score=0.0, + badge=VerificationResult.score_to_badge(0.0), + domain=self.domain, + details=details, + errors=["Inference script failed to produce valid JSON output"], + compute_time_seconds=elapsed, + ) + + # Component 1: Model loadable (0.20) + model_loaded = inference_results.get("model_loaded", False) + component_scores["model_loadable"] = 1.0 if model_loaded else 0.0 + details["model_loaded"] = model_loaded + + if not model_loaded: + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + return VerificationResult( + passed=False, + score=0.0, + badge=VerificationResult.score_to_badge(0.0), + domain=self.domain, + details=details, + errors=[inference_results.get("error", "Model failed to load")], + compute_time_seconds=elapsed, + ) + + # Component 2: Inference runs (0.30) + total_samples = inference_results.get("total_samples", 0) + successful_samples = inference_results.get("successful_samples", 0) + inference_score = successful_samples / max(total_samples, 1) + component_scores["inference_runs"] = round(inference_score, 4) + details["inference"] = { + "total": total_samples, + "successful": successful_samples, + } + + # Component 3: Accuracy match (0.35) + live_metrics = inference_results.get("metrics", {}) + if live_metrics and claimed_metrics: + matches = 0 + comparisons: dict = {} + for metric_name, claimed_val in claimed_metrics.items(): + if not isinstance(claimed_val, (int, float)): + continue + live_val = live_metrics.get(metric_name) + if live_val is None: + continue + tolerance = max(abs(claimed_val) * 0.05, 0.01) + match = abs(claimed_val - live_val) <= tolerance + comparisons[metric_name] = { + "claimed": claimed_val, + "live": live_val, + "tolerance": tolerance, + "match": 
match, + } + if match: + matches += 1 + + if comparisons: + component_scores["accuracy_match"] = round(matches / len(comparisons), 4) + else: + component_scores["accuracy_match"] = 0.3 + warnings.append("No comparable metrics between claimed and live results") + details["accuracy_comparisons"] = comparisons + else: + component_scores["accuracy_match"] = 0.0 + details["accuracy_comparisons"] = {} + + # Component 4: Latency reasonable (0.15) + avg_latency = inference_results.get("avg_latency_seconds", 0) + if avg_latency > 0 and avg_latency < 60: + component_scores["latency"] = 1.0 + elif avg_latency < 120: + component_scores["latency"] = 0.5 + else: + component_scores["latency"] = 0.2 + details["avg_latency_seconds"] = avg_latency + + weights = { + "model_loadable": 0.20, + "inference_runs": 0.30, + "accuracy_match": 0.35, + "latency": 0.15, + } + score = sum(weights.get(k, 0) * component_scores.get(k, 0.0) for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + warnings=warnings, + compute_time_seconds=elapsed, + ) + + @staticmethod + def _build_inference_script( + model_id: str, benchmark: str, sample_size: int, + ) -> str: + """Generate a Python script for Docker-based live inference.""" + return f'''#!/usr/bin/env python3 +"""Auto-generated inference script for live benchmark verification.""" +import json +import time +import sys + +results = {{ + "model_loaded": False, + "total_samples": 0, + "successful_samples": 0, + "metrics": {{}}, + "avg_latency_seconds": 0, +}} + +try: + from transformers import AutoModelForCausalLM, AutoTokenizer + import torch + + model_id = "{model_id}" + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_id, 
torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True, + ) + model.eval() + results["model_loaded"] = True + + # Simple text generation benchmark + sample_size = {sample_size} + prompts = [f"Question {{i}}: What is {{i}} + {{i}}?" for i in range(sample_size)] + + latencies = [] + successful = 0 + + for prompt in prompts: + try: + start = time.monotonic() + inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128) + with torch.no_grad(): + outputs = model.generate(**inputs, max_new_tokens=32, do_sample=False) + elapsed = time.monotonic() - start + latencies.append(elapsed) + successful += 1 + except Exception: + pass + + results["total_samples"] = sample_size + results["successful_samples"] = successful + results["avg_latency_seconds"] = round(sum(latencies) / max(len(latencies), 1), 3) + results["metrics"] = {{ + "inference_success_rate": round(successful / sample_size, 4) if sample_size > 0 else 0, + }} + +except Exception as e: + results["error"] = str(e) + +print(json.dumps(results)) +''' + # ------------------------------------------------------------------ # ml_experiment — git-based provenance # ------------------------------------------------------------------ diff --git a/backend/verification/physics_adapter.py b/backend/verification/physics_adapter.py new file mode 100644 index 0000000..33a3953 --- /dev/null +++ b/backend/verification/physics_adapter.py @@ -0,0 +1,600 @@ +"""Physics verification: conservation laws, dimensional analysis, symbolic math. + +CPU-only (no Docker) — pint and sympy via asyncio.to_thread(). 
+""" +from __future__ import annotations + +import asyncio +import math +import time +from typing import Any + +from backend.logging_config import get_logger +from backend.verification.base import ( + VerificationAdapter, + VerificationBadge, + VerificationResult, +) + +logger = get_logger(__name__) + +# Graceful imports +try: + import pint + PINT_AVAILABLE = True + _ureg = pint.UnitRegistry() +except ImportError: + PINT_AVAILABLE = False + _ureg = None + logger.warning("pint_not_available") + +try: + import sympy + from sympy.parsing.sympy_parser import parse_expr, standard_transformations, implicit_multiplication_application + SYMPY_AVAILABLE = True +except ImportError: + SYMPY_AVAILABLE = False + logger.warning("sympy_not_available") + + +class PhysicsAdapter(VerificationAdapter): + domain = "physics" + + async def verify(self, task_result: dict, task_metadata: dict) -> VerificationResult: + claim_type = task_result.get("claim_type", "numerical_simulation") + + if claim_type == "numerical_simulation": + return await self._verify_simulation(task_result) + elif claim_type == "analytical_derivation": + return await self._verify_derivation(task_result) + elif claim_type == "dimensional_analysis": + return await self._verify_dimensions(task_result) + else: + return VerificationResult.fail(self.domain, [f"Unknown claim_type: {claim_type}"]) + + # ------------------------------------------------------------------ + # numerical_simulation + # ------------------------------------------------------------------ + + async def _verify_simulation(self, result: dict) -> VerificationResult: + start = time.monotonic() + + sim_data = result.get("simulation_data", {}) + conservation = result.get("conservation_quantities", {}) + + if not sim_data and not conservation: + return VerificationResult.fail(self.domain, ["No simulation_data or conservation_quantities"]) + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"claim_type": "numerical_simulation"} + 
warnings: list[str] = [] + + # Component 1: Conservation laws (0.35) + conserv_result = await asyncio.to_thread( + self._check_conservation, conservation, sim_data, + ) + component_scores["conservation_laws"] = conserv_result["score"] + details["conservation"] = conserv_result + if conserv_result.get("warnings"): + warnings.extend(conserv_result["warnings"]) + + # Component 2: Stability (0.25) + stability_result = await asyncio.to_thread(self._check_stability, sim_data) + component_scores["stability"] = stability_result["score"] + details["stability"] = stability_result + + # Component 3: Convergence (0.25) + convergence_result = await asyncio.to_thread( + self._check_convergence, sim_data, + ) + component_scores["convergence"] = convergence_result["score"] + details["convergence"] = convergence_result + + # Component 4: Boundary conditions (0.15) + boundary_result = await asyncio.to_thread( + self._check_boundary_conditions, sim_data, + ) + component_scores["boundary_conditions"] = boundary_result["score"] + details["boundary_conditions"] = boundary_result + + weights = { + "conservation_laws": 0.35, + "stability": 0.25, + "convergence": 0.25, + "boundary_conditions": 0.15, + } + score = sum(weights[k] * component_scores.get(k, 0.0) for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + warnings=warnings, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # analytical_derivation + # ------------------------------------------------------------------ + + async def _verify_derivation(self, result: dict) -> VerificationResult: + start = time.monotonic() + + expression = result.get("expression") + units = result.get("units", {}) + lhs = result.get("lhs") + rhs = 
result.get("rhs") + + if not expression and not (lhs and rhs): + return VerificationResult.fail( + self.domain, ["No expression or lhs/rhs provided"], + ) + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"claim_type": "analytical_derivation"} + warnings: list[str] = [] + + # Component 1: Dimensional consistency (0.40) + dim_result = await asyncio.to_thread( + self._check_dimensional_consistency, expression or f"({lhs}) - ({rhs})", units, + ) + component_scores["dimensional_consistency"] = dim_result["score"] + details["dimensional_consistency"] = dim_result + + # Component 2: Symbolic validity (0.30) + sym_result = await asyncio.to_thread( + self._check_symbolic_validity, expression, lhs, rhs, + ) + component_scores["symbolic_validity"] = sym_result["score"] + details["symbolic_validity"] = sym_result + + # Component 3: Unit consistency (0.30) + unit_result = await asyncio.to_thread( + self._check_unit_consistency, units, + ) + component_scores["unit_consistency"] = unit_result["score"] + details["unit_consistency"] = unit_result + + weights = { + "dimensional_consistency": 0.40, + "symbolic_validity": 0.30, + "unit_consistency": 0.30, + } + score = sum(weights[k] * component_scores.get(k, 0.0) for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + warnings=warnings, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # dimensional_analysis + # ------------------------------------------------------------------ + + async def _verify_dimensions(self, result: dict) -> VerificationResult: + start = time.monotonic() + + expression = result.get("expression") + lhs = result.get("lhs") + rhs = result.get("rhs") + units = result.get("units", {}) + 
+ if not expression and not (lhs and rhs): + return VerificationResult.fail(self.domain, ["No expression or lhs/rhs"]) + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {"claim_type": "dimensional_analysis"} + + # Component 1: Dimensions match (0.50) + dim_result = await asyncio.to_thread( + self._check_dimensional_consistency, expression or f"({lhs}) - ({rhs})", units, + ) + component_scores["dimensions_match"] = dim_result["score"] + details["dimensions"] = dim_result + + # Component 2: Units consistent (0.30) + unit_result = await asyncio.to_thread( + self._check_unit_consistency, units, + ) + component_scores["units_consistent"] = unit_result["score"] + details["units"] = unit_result + + # Component 3: Expression valid (0.20) + expr_result = await asyncio.to_thread( + self._check_expression_valid, expression or f"({lhs}) - ({rhs})", + ) + component_scores["expression_valid"] = expr_result["score"] + details["expression"] = expr_result + + weights = {"dimensions_match": 0.50, "units_consistent": 0.30, "expression_valid": 0.20} + score = sum(weights[k] * component_scores.get(k, 0.0) for k in weights) + score = min(1.0, round(score, 4)) + + elapsed = time.monotonic() - start + details["component_scores"] = component_scores + + return VerificationResult( + passed=score >= 0.5, + score=score, + badge=VerificationResult.score_to_badge(score), + domain=self.domain, + details=details, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # Component check implementations + # ------------------------------------------------------------------ + + @staticmethod + def _check_conservation( + conservation: dict, sim_data: dict, + ) -> dict: + """Check conservation of energy/momentum/mass.""" + if not conservation: + return {"score": 0.5, "applicable": False, "note": "No conservation quantities"} + + results: list[dict] = [] + conserved = 0 + + for quantity, data in conservation.items(): + if not 
isinstance(data, dict): + continue + + initial = data.get("initial") + final = data.get("final") + + if initial is None or final is None: + continue + + if not isinstance(initial, (int, float)) or not isinstance(final, (int, float)): + continue + + tolerance = data.get("tolerance", max(abs(initial) * 0.01, 1e-10)) + deviation = abs(final - initial) + is_conserved = deviation <= tolerance + + results.append({ + "quantity": quantity, + "initial": initial, + "final": final, + "deviation": deviation, + "tolerance": tolerance, + "conserved": is_conserved, + }) + + if is_conserved: + conserved += 1 + + if not results: + return {"score": 0.5, "applicable": False, "note": "No initial/final pairs"} + + score = conserved / len(results) + return { + "score": round(score, 4), + "applicable": True, + "conserved": conserved, + "total": len(results), + "results": results, + "warnings": [f"Conservation violated for {len(results) - conserved} quantit(ies)"] + if conserved < len(results) else [], + } + + @staticmethod + def _check_stability(sim_data: dict) -> dict: + """Check for diverging quantities (NaN, Inf, exponential growth).""" + time_series = sim_data.get("time_series", {}) + if not time_series: + return {"score": 0.5, "applicable": False, "note": "No time series data"} + + issues: list[str] = [] + + for name, values in time_series.items(): + if not isinstance(values, list): + continue + + has_nan = any( + (isinstance(v, float) and (math.isnan(v) or math.isinf(v))) + for v in values if isinstance(v, (int, float)) + ) + if has_nan: + issues.append(f"{name}: contains NaN/Inf") + continue + + # Check for exponential growth in last quarter + numeric_vals = [float(v) for v in values if isinstance(v, (int, float))] + if len(numeric_vals) < 4: + continue + + quarter = len(numeric_vals) // 4 + last_quarter = numeric_vals[-quarter:] + first_quarter = numeric_vals[:quarter] + + if first_quarter and last_quarter: + first_mean = sum(abs(v) for v in first_quarter) / len(first_quarter) + 
last_mean = sum(abs(v) for v in last_quarter) / len(last_quarter) + + if first_mean > 0 and last_mean / first_mean > 100: + issues.append(f"{name}: possible exponential growth (ratio={last_mean / first_mean:.0f})") + + if not time_series: + score = 0.5 + elif not issues: + score = 1.0 + else: + score = max(0.0, 1.0 - 0.3 * len(issues)) + + return { + "score": round(score, 4), + "applicable": bool(time_series), + "series_checked": len(time_series), + "issues": issues[:10], + } + + @staticmethod + def _check_convergence(sim_data: dict) -> dict: + """Check if error decreases with mesh refinement.""" + refinement = sim_data.get("mesh_refinement") or sim_data.get("convergence_data") + if not refinement: + return {"score": 0.5, "applicable": False, "note": "No convergence data"} + + if isinstance(refinement, list) and len(refinement) >= 2: + # Expect list of {resolution, error} dicts + errors = [] + for entry in refinement: + if isinstance(entry, dict) and "error" in entry: + errors.append(float(entry["error"])) + + if len(errors) >= 2: + # Error should decrease monotonically + decreasing = all(errors[i] >= errors[i + 1] for i in range(len(errors) - 1)) + if decreasing: + score = 1.0 + else: + # Partial credit for mostly decreasing + n_decreasing = sum(1 for i in range(len(errors) - 1) if errors[i] >= errors[i + 1]) + score = n_decreasing / (len(errors) - 1) + + return { + "score": round(score, 4), + "applicable": True, + "errors": errors, + "monotonically_decreasing": decreasing, + } + + return {"score": 0.5, "applicable": False, "note": "Could not parse convergence data"} + + @staticmethod + def _check_boundary_conditions(sim_data: dict) -> dict: + """Check boundary values consistency.""" + boundaries = sim_data.get("boundary_conditions", {}) + if not boundaries: + return {"score": 0.5, "applicable": False, "note": "No boundary conditions specified"} + + results = sim_data.get("boundary_results", {}) + if not results: + return {"score": 0.5, "applicable": False, 
"note": "No boundary results to check"} + + matches = 0 + total = 0 + checks: list[dict] = [] + + for location, expected in boundaries.items(): + actual = results.get(location) + if actual is None: + checks.append({"location": location, "match": False, "note": "No result"}) + total += 1 + continue + + total += 1 + if isinstance(expected, (int, float)) and isinstance(actual, (int, float)): + tolerance = max(abs(expected) * 0.01, 1e-10) + match = abs(expected - actual) <= tolerance + else: + match = expected == actual + + checks.append({ + "location": location, + "expected": expected, + "actual": actual, + "match": match, + }) + if match: + matches += 1 + + score = matches / total if total > 0 else 0.5 + return { + "score": round(score, 4), + "applicable": True, + "checks": checks, + "matches": matches, + "total": total, + } + + @staticmethod + def _check_dimensional_consistency(expression: str, units: dict) -> dict: + """Check all terms in an equation have the same dimensions.""" + if not PINT_AVAILABLE: + return {"score": 0.5, "note": "pint unavailable"} + + if not units: + return {"score": 0.5, "applicable": False, "note": "No units specified"} + + try: + # Try to parse each variable's units and check consistency + parsed_units: dict[str, Any] = {} + for var_name, unit_str in units.items(): + try: + parsed_units[var_name] = _ureg.parse_expression(unit_str) + except Exception: + return { + "score": 0.3, + "applicable": True, + "error": f"Could not parse unit: {unit_str} for {var_name}", + } + + # If we have lhs_units and rhs_units, check they're compatible + lhs_unit = parsed_units.get("lhs") or parsed_units.get("result") + rhs_unit = parsed_units.get("rhs") or parsed_units.get("expression") + + if lhs_unit is not None and rhs_unit is not None: + try: + lhs_unit.to(rhs_unit.units) + return { + "score": 1.0, + "applicable": True, + "lhs_dimensions": str(lhs_unit.dimensionality), + "rhs_dimensions": str(rhs_unit.dimensionality), + "compatible": True, + } + except 
pint.DimensionalityError: + return { + "score": 0.0, + "applicable": True, + "lhs_dimensions": str(lhs_unit.dimensionality), + "rhs_dimensions": str(rhs_unit.dimensionality), + "compatible": False, + } + + return {"score": 0.7, "applicable": True, "note": "Units parsed but no LHS/RHS pair to compare"} + + except Exception as e: + return {"score": 0.3, "applicable": True, "error": str(e)} + + @staticmethod + def _check_symbolic_validity( + expression: str | None, + lhs: str | None, + rhs: str | None, + ) -> dict: + """Check expression parses and simplifies correctly.""" + if not SYMPY_AVAILABLE: + return {"score": 0.5, "note": "sympy unavailable"} + + try: + transformations = standard_transformations + (implicit_multiplication_application,) + + if expression: + expr = parse_expr(expression, transformations=transformations) + simplified = sympy.simplify(expr) + return { + "score": 1.0, + "applicable": True, + "parsed": str(expr), + "simplified": str(simplified), + "is_zero": simplified == 0, + } + + if lhs and rhs: + lhs_expr = parse_expr(lhs, transformations=transformations) + rhs_expr = parse_expr(rhs, transformations=transformations) + diff = sympy.simplify(lhs_expr - rhs_expr) + is_equal = diff == 0 + + return { + "score": 1.0 if is_equal else 0.7, + "applicable": True, + "lhs_parsed": str(lhs_expr), + "rhs_parsed": str(rhs_expr), + "difference": str(diff), + "symbolically_equal": is_equal, + } + + return {"score": 0.5, "applicable": False, "note": "No expression to parse"} + + except Exception as e: + return {"score": 0.0, "applicable": True, "error": f"Parse error: {str(e)}"} + + @staticmethod + def _check_unit_consistency(units: dict) -> dict: + """Verify units convert correctly between systems.""" + if not PINT_AVAILABLE: + return {"score": 0.5, "note": "pint unavailable"} + + if not units: + return {"score": 0.5, "applicable": False, "note": "No units"} + + conversions = units.get("conversions", []) + if not conversions: + # Just check all units are 
parseable + parseable = 0 + for name, unit_str in units.items(): + if name == "conversions": + continue + try: + _ureg.parse_expression(unit_str) + parseable += 1 + except Exception: + pass + + total = len([k for k in units if k != "conversions"]) + score = parseable / total if total > 0 else 0.5 + return { + "score": round(score, 4), + "applicable": True, + "parseable": parseable, + "total": total, + } + + # Check explicit conversions + correct = 0 + for conv in conversions: + if not isinstance(conv, dict): + continue + from_val = conv.get("from_value") + from_unit = conv.get("from_unit") + to_val = conv.get("to_value") + to_unit = conv.get("to_unit") + + if None in (from_val, from_unit, to_val, to_unit): + continue + + try: + quantity = _ureg.Quantity(float(from_val), from_unit) + converted = quantity.to(to_unit).magnitude + tolerance = max(abs(float(to_val)) * 0.01, 1e-10) + if abs(converted - float(to_val)) <= tolerance: + correct += 1 + except Exception: + pass + + score = correct / len(conversions) if conversions else 0.5 + return { + "score": round(score, 4), + "applicable": True, + "correct_conversions": correct, + "total_conversions": len(conversions), + } + + @staticmethod + def _check_expression_valid(expression: str) -> dict: + """Check if expression is syntactically valid.""" + if not SYMPY_AVAILABLE: + return {"score": 0.5, "note": "sympy unavailable"} + + try: + transformations = standard_transformations + (implicit_multiplication_application,) + expr = parse_expr(expression, transformations=transformations) + return { + "score": 1.0, + "applicable": True, + "parsed": str(expr), + "free_symbols": [str(s) for s in expr.free_symbols], + } + except Exception as e: + return {"score": 0.0, "applicable": True, "error": str(e)} diff --git a/backend/verification/reproducibility_executor.py b/backend/verification/reproducibility_executor.py new file mode 100644 index 0000000..b54d61a --- /dev/null +++ b/backend/verification/reproducibility_executor.py @@ -0,0 
+1,317 @@ +"""Cross-cutting verifier: Reproducibility Executor. + +Clones a code repository, installs dependencies, runs the code in a +Docker sandbox, and compares outputs against claimed results. +""" +from __future__ import annotations + +import asyncio +import hashlib +import json +import tempfile +import time +from pathlib import Path +from typing import Any + +from backend.logging_config import get_logger +from backend.verification.cross_cutting_base import ( + CrossCuttingResult, + CrossCuttingVerifier, +) + +logger = get_logger(__name__) + +REPRO_IMAGE = "clawdlab/reproducibility:latest" +REPRO_TIMEOUT = 300 # 5 minutes +CLONE_TIMEOUT = 60 + + +class ReproducibilityExecutor(CrossCuttingVerifier): + name = "reproducibility" + default_weight = 0.15 + requires_docker = True + + def is_applicable(self, task_result: dict, task_metadata: dict) -> bool: + return bool(task_result.get("code_repo") and task_result.get("code_commit")) + + async def verify(self, task_result: dict, task_metadata: dict) -> CrossCuttingResult: + start = time.monotonic() + + code_repo = task_result["code_repo"] + code_commit = task_result["code_commit"] + claimed_results = task_result.get("claimed_results", {}) + output_checksums = task_result.get("output_checksums", {}) + entry_point = task_result.get("entry_point") + + component_scores: dict[str, float] = {} + details: dict[str, Any] = { + "repo": code_repo, + "commit": code_commit, + } + warnings: list[str] = [] + errors: list[str] = [] + + with tempfile.TemporaryDirectory() as tmpdir: + # Component 1: Repo cloneable (0.15) + clone_ok, clone_detail = await self._clone_repo(code_repo, code_commit, tmpdir) + component_scores["repo_cloneable"] = 1.0 if clone_ok else 0.0 + details["clone"] = clone_detail + if not clone_ok: + errors.append(clone_detail.get("error", "Clone failed")) + elapsed = time.monotonic() - start + return CrossCuttingResult( + verifier_name=self.name, + score=0.0, + weight=self.default_weight, + details=details, + 
errors=errors, + compute_time_seconds=elapsed, + ) + + # Component 2: Deps installable (0.25) + deps_ok, deps_detail = await self._check_deps(tmpdir) + component_scores["deps_installable"] = deps_detail.get("score", 0.0) + details["deps"] = deps_detail + + # Component 3: Execution success (0.35) + exec_ok, exec_detail = await self._execute(tmpdir, entry_point) + component_scores["execution_success"] = 1.0 if exec_ok else 0.0 + details["execution"] = exec_detail + if not exec_ok and exec_detail.get("error"): + warnings.append(f"Execution: {exec_detail['error']}") + + # Component 4: Output match (0.25) + output_score, output_detail = self._check_outputs( + exec_detail.get("outputs", {}), + claimed_results, + output_checksums, + ) + component_scores["output_match"] = output_score + details["output_match"] = output_detail + + weights = { + "repo_cloneable": 0.15, + "deps_installable": 0.25, + "execution_success": 0.35, + "output_match": 0.25, + } + + score = sum(weights[k] * component_scores.get(k, 0.0) for k in weights) + score = min(1.0, round(score, 4)) + details["component_scores"] = component_scores + + elapsed = time.monotonic() - start + return CrossCuttingResult( + verifier_name=self.name, + score=score, + weight=self.default_weight, + details=details, + errors=errors, + warnings=warnings, + compute_time_seconds=elapsed, + ) + + async def _clone_repo( + self, repo_url: str, commit: str, workdir: str, + ) -> tuple[bool, dict]: + """Clone repo and checkout specific commit.""" + try: + proc = await asyncio.create_subprocess_exec( + "git", "clone", "--depth", "50", repo_url, f"{workdir}/repo", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=CLONE_TIMEOUT, + ) + if proc.returncode != 0: + return False, { + "error": stderr.decode(errors="replace")[:500], + "cloned": False, + } + + # Checkout commit + proc2 = await asyncio.create_subprocess_exec( + "git", "-C", 
f"{workdir}/repo", "checkout", commit, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout2, stderr2 = await asyncio.wait_for( + proc2.communicate(), timeout=15, + ) + + checked_out = proc2.returncode == 0 + return checked_out, { + "cloned": True, + "checked_out": checked_out, + "commit": commit, + } + except asyncio.TimeoutError: + return False, {"error": "Clone timed out", "cloned": False} + except Exception as e: + return False, {"error": str(e), "cloned": False} + + async def _check_deps(self, workdir: str) -> tuple[bool, dict]: + """Check if dependency files exist and are installable.""" + repo_path = Path(workdir) / "repo" + dep_files = { + "requirements.txt": "pip install -r requirements.txt", + "pyproject.toml": "pip install .", + "setup.py": "pip install .", + "environment.yml": "conda env create -f environment.yml", + } + + found: list[str] = [] + for f in dep_files: + if (repo_path / f).exists(): + found.append(f) + + if not found: + return False, { + "score": 0.3, + "found": [], + "note": "No dependency files found", + } + + return True, { + "score": 1.0 if "requirements.txt" in found or "pyproject.toml" in found else 0.7, + "found": found, + } + + async def _execute( + self, workdir: str, entry_point: str | None, + ) -> tuple[bool, dict]: + """Run the code in a Docker sandbox.""" + repo_path = Path(workdir) / "repo" + + # Determine entry point + if not entry_point: + entry_point = self._detect_entry_point(repo_path) + + if not entry_point: + return False, { + "error": "No entry point found (Makefile, run.sh, main.py, reproduce.py)", + "outputs": {}, + } + + # Build docker command + cmd = [ + "docker", "run", "--rm", + "--network=none", + "--memory=4g", + "--cpus=2", + "-v", f"{repo_path}:/workspace:ro", + "-w", "/workspace", + REPRO_IMAGE, + ] + + if entry_point == "Makefile": + cmd.extend(["make", "reproduce"]) + elif entry_point.endswith(".sh"): + cmd.extend(["bash", entry_point]) + else: + cmd.extend(["python", 
entry_point]) + + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=REPRO_TIMEOUT, + ) + + stdout_text = stdout.decode(errors="replace")[:5000] + stderr_text = stderr.decode(errors="replace")[:2000] + success = proc.returncode == 0 + + # Try to parse JSON output + outputs: dict = {} + try: + outputs = json.loads(stdout_text) + except (json.JSONDecodeError, ValueError): + outputs = {"raw_output": stdout_text[:1000]} + + return success, { + "exit_code": proc.returncode, + "entry_point": entry_point, + "stdout_preview": stdout_text[:500], + "stderr_preview": stderr_text[:500] if not success else "", + "outputs": outputs, + } + + except asyncio.TimeoutError: + return False, {"error": f"Execution timed out ({REPRO_TIMEOUT}s)", "outputs": {}} + except FileNotFoundError: + return False, {"error": "Docker not available", "outputs": {}} + except Exception as e: + return False, {"error": str(e), "outputs": {}} + + @staticmethod + def _detect_entry_point(repo_path: Path) -> str | None: + """Auto-detect the entry point for reproduction.""" + candidates = ["reproduce.py", "run.sh", "main.py", "Makefile"] + for c in candidates: + if (repo_path / c).exists(): + return c + return None + + @staticmethod + def _check_outputs( + actual_outputs: dict, + claimed_results: dict, + output_checksums: dict, + ) -> tuple[float, dict]: + """Compare actual outputs against claimed results and checksums.""" + if not claimed_results and not output_checksums: + return 0.5, {"note": "No claimed results to compare against"} + + checks: list[dict] = [] + matches = 0 + total = 0 + + # Numeric comparison with tolerance + for key, claimed in claimed_results.items(): + actual = actual_outputs.get(key) + if actual is None: + checks.append({"key": key, "match": False, "note": "Missing in output"}) + total += 1 + continue + + total += 1 + if 
isinstance(claimed, (int, float)) and isinstance(actual, (int, float)): + tolerance = max(abs(claimed) * 0.05, 1e-6) + match = abs(claimed - actual) <= tolerance + checks.append({ + "key": key, "match": match, + "claimed": claimed, "actual": actual, + "tolerance": tolerance, + }) + else: + match = str(claimed) == str(actual) + checks.append({"key": key, "match": match}) + + if match: + matches += 1 + + # Checksum verification + for filename, expected_hash in output_checksums.items(): + total += 1 + actual_data = actual_outputs.get(filename) + if actual_data: + actual_hash = hashlib.sha256(str(actual_data).encode()).hexdigest() + match = actual_hash == expected_hash + checks.append({ + "key": f"checksum:{filename}", + "match": match, + "expected": expected_hash[:16] + "...", + "actual": actual_hash[:16] + "...", + }) + if match: + matches += 1 + else: + checks.append({"key": f"checksum:{filename}", "match": False, "note": "File not in output"}) + + score = matches / total if total > 0 else 0.5 + return round(score, 4), {"checks": checks, "matches": matches, "total": total} diff --git a/backend/verification/statistical_forensics.py b/backend/verification/statistical_forensics.py new file mode 100644 index 0000000..cabe8ad --- /dev/null +++ b/backend/verification/statistical_forensics.py @@ -0,0 +1,437 @@ +"""Cross-cutting verifier: Statistical Forensics.
+ +Detects fabricated or implausible statistics via: +- GRIM test (granularity-related inconsistency of means) +- SPRITE test (sample parameter reconstruction via iteration) +- Benford's law (first-digit distribution) +- P-curve analysis (p-value distribution shape) +""" +from __future__ import annotations + +import asyncio +import math +import random +import time +from typing import Any + +from backend.logging_config import get_logger +from backend.verification.cross_cutting_base import ( + CrossCuttingResult, + CrossCuttingVerifier, +) + +logger = get_logger(__name__) + + +class StatisticalForensicsVerifier(CrossCuttingVerifier): + name = "statistical_forensics" + default_weight = 0.10 + + def is_applicable(self, task_result: dict, task_metadata: dict) -> bool: + stat_keys = {"statistical_claims", "means", "p_values", "metrics", "results_summary"} + return any(k in task_result and task_result[k] for k in stat_keys) + + async def verify(self, task_result: dict, task_metadata: dict) -> CrossCuttingResult: + start = time.monotonic() + + component_scores: dict[str, float] = {} + details: dict[str, Any] = {} + warnings: list[str] = [] + + # Extract data for each test + means_data = self._extract_means(task_result) + p_values = self._extract_p_values(task_result) + all_numbers = self._extract_all_numbers(task_result) + + # Run all tests concurrently via threads (CPU-bound) + grim_task = asyncio.to_thread(self._run_grim, means_data) if means_data else _noop_result("grim", "No means data") + sprite_task = asyncio.to_thread(self._run_sprite, means_data) if means_data else _noop_result("sprite", "No means data") + benford_task = asyncio.to_thread(self._run_benford, all_numbers) if len(all_numbers) >= 10 else _noop_result("benford", "Insufficient numbers (<10)") + pcurve_task = asyncio.to_thread(self._run_pcurve, p_values) if len(p_values) >= 3 else _noop_result("pcurve", "Insufficient p-values (<3)") + + grim_result, sprite_result, benford_result, pcurve_result = await 
asyncio.gather( + grim_task, sprite_task, benford_task, pcurve_task, + ) + + for name, result in [("grim", grim_result), ("sprite", sprite_result), + ("benford", benford_result), ("pcurve", pcurve_result)]: + component_scores[name] = result.get("score", 0.5) + details[name] = result + if result.get("warnings"): + warnings.extend(result["warnings"]) + + # Equal weight for all 4 components + applicable = [k for k in component_scores if details[k].get("applicable", True)] + if applicable: + score = sum(component_scores[k] for k in applicable) / len(applicable) + else: + score = 0.5 # neutral + + elapsed = time.monotonic() - start + + return CrossCuttingResult( + verifier_name=self.name, + score=round(score, 4), + weight=self.default_weight, + details=details, + warnings=warnings, + compute_time_seconds=elapsed, + ) + + # ------------------------------------------------------------------ + # Data extraction + # ------------------------------------------------------------------ + + @staticmethod + def _extract_means(task_result: dict) -> list[dict]: + """Extract mean/n/sd triples from results.""" + means = task_result.get("means", []) + if isinstance(means, list): + return [m for m in means if isinstance(m, dict) and "mean" in m] + + # Also check statistical_claims + claims = task_result.get("statistical_claims", []) + extracted = [] + for claim in claims: + if isinstance(claim, dict) and "mean" in claim: + extracted.append(claim) + return extracted + + @staticmethod + def _extract_p_values(task_result: dict) -> list[float]: + """Extract p-values from results.""" + direct = task_result.get("p_values", []) + if isinstance(direct, list) and direct: + return [float(p) for p in direct if isinstance(p, (int, float)) and 0 < p < 1] + + # From statistical_claims + p_vals = [] + for claim in task_result.get("statistical_claims", []): + if isinstance(claim, dict): + p = claim.get("p_value") + if isinstance(p, (int, float)) and 0 < p < 1: + p_vals.append(float(p)) + return p_vals 
+ + @staticmethod + def _extract_all_numbers(task_result: dict) -> list[float]: + """Recursively extract all numeric values from the result.""" + numbers: list[float] = [] + + def _walk(obj: Any) -> None: + if isinstance(obj, (int, float)) and not isinstance(obj, bool): + if obj != 0 and math.isfinite(obj): + numbers.append(float(abs(obj))) + elif isinstance(obj, dict): + for v in obj.values(): + _walk(v) + elif isinstance(obj, list): + for item in obj: + _walk(item) + + for key in ("metrics", "results_summary", "statistical_claims", "means", "p_values"): + if key in task_result: + _walk(task_result[key]) + return numbers + + # ------------------------------------------------------------------ + # GRIM test + # ------------------------------------------------------------------ + + @staticmethod + def _run_grim(means_data: list[dict]) -> dict: + """GRIM test: are reported means possible given sample size? + + For integer-valued measurements, n * mean must be an integer + (within rounding tolerance). 
+ """ + if not means_data: + return {"score": 0.5, "applicable": False, "note": "No means data"} + + passed = 0 + failed = 0 + results: list[dict] = [] + + for entry in means_data: + mean = entry.get("mean") + n = entry.get("n") or entry.get("sample_size") + if mean is None or n is None: + continue + if not isinstance(n, (int, float)) or n <= 0: + continue + + n = int(n) + product = n * float(mean) + # Check if product is close to an integer + remainder = abs(product - round(product)) + # Allow for rounding to 2 decimal places + tolerance = n * 0.005 + 0.01 + is_consistent = remainder <= tolerance + + results.append({ + "mean": mean, "n": n, + "product": round(product, 4), + "remainder": round(remainder, 4), + "consistent": is_consistent, + }) + + if is_consistent: + passed += 1 + else: + failed += 1 + + if not results: + return {"score": 0.5, "applicable": False, "note": "No mean+n pairs"} + + score = passed / len(results) if results else 0.5 + return { + "score": round(score, 4), + "applicable": True, + "passed": passed, + "failed": failed, + "total": len(results), + "results": results[:10], + "warnings": [f"GRIM: {failed} inconsistent mean(s)"] if failed else [], + } + + # ------------------------------------------------------------------ + # SPRITE test + # ------------------------------------------------------------------ + + @staticmethod + def _run_sprite(means_data: list[dict]) -> dict: + """SPRITE: can a mean+SD combination be achieved with integer data? + + Uses simulated annealing to find a valid dataset, capped at n=200. 
+ """ + results: list[dict] = [] + passed = 0 + failed = 0 + + for entry in means_data: + mean = entry.get("mean") + sd = entry.get("sd") or entry.get("std") + n = entry.get("n") or entry.get("sample_size") + scale_min = entry.get("scale_min", 1) + scale_max = entry.get("scale_max", 7) + + if mean is None or sd is None or n is None: + continue + n = int(n) + if n <= 0 or n > 200: + continue + + mean, sd = float(mean), float(sd) + achievable = _sprite_check(mean, sd, n, int(scale_min), int(scale_max)) + + results.append({ + "mean": mean, "sd": sd, "n": n, + "achievable": achievable, + }) + + if achievable: + passed += 1 + else: + failed += 1 + + if not results: + return {"score": 0.5, "applicable": False, "note": "No mean+sd+n triples"} + + score = passed / len(results) if results else 0.5 + return { + "score": round(score, 4), + "applicable": True, + "passed": passed, + "failed": failed, + "total": len(results), + "results": results[:10], + "warnings": [f"SPRITE: {failed} implausible mean/SD combination(s)"] if failed else [], + } + + # ------------------------------------------------------------------ + # Benford's law + # ------------------------------------------------------------------ + + @staticmethod + def _run_benford(numbers: list[float]) -> dict: + """Check first-digit distribution against Benford's law.""" + if len(numbers) < 10: + return {"score": 0.5, "applicable": False, "note": "Too few numbers"} + + # Count first digits + digit_counts = [0] * 10 + for num in numbers: + s = f"{abs(num):.10g}".lstrip("0").lstrip(".") + if s and s[0].isdigit(): + d = int(s[0]) + if 1 <= d <= 9: + digit_counts[d] += 1 + + total = sum(digit_counts[1:]) + if total < 10: + return {"score": 0.5, "applicable": False, "note": "Too few leading digits"} + + # Expected Benford frequencies + expected = [0.0] + [math.log10(1 + 1 / d) for d in range(1, 10)] + observed_freq = [0.0] + [digit_counts[d] / total for d in range(1, 10)] + + # Chi-square statistic + chi2 = sum( + 
(observed_freq[d] - expected[d]) ** 2 / expected[d] + for d in range(1, 10) + ) * total + + # 8 degrees of freedom, critical value at p=0.05 is 15.507 + p_approx = _chi2_survival(chi2, 8) + + # Score based on p-value: high p = consistent with Benford + if p_approx > 0.10: + score = 1.0 + elif p_approx > 0.05: + score = 0.7 + elif p_approx > 0.01: + score = 0.4 + else: + score = 0.1 + + return { + "score": round(score, 4), + "applicable": True, + "chi2": round(chi2, 4), + "p_value_approx": round(p_approx, 6), + "digit_counts": {str(d): digit_counts[d] for d in range(1, 10)}, + "total_numbers": total, + "warnings": [f"Benford's law: chi2={chi2:.2f}, p={p_approx:.4f}"] if p_approx < 0.05 else [], + } + + # ------------------------------------------------------------------ + # P-curve analysis + # ------------------------------------------------------------------ + + @staticmethod + def _run_pcurve(p_values: list[float]) -> dict: + """P-curve: significant p-values should be right-skewed under real effects.""" + if len(p_values) < 3: + return {"score": 0.5, "applicable": False, "note": "Too few p-values"} + + sig_ps = [p for p in p_values if 0 < p < 0.05] + if len(sig_ps) < 3: + return {"score": 0.5, "applicable": False, "note": "Too few significant p-values"} + + # Under a real effect, p-values < 0.05 should be right-skewed + # (more p-values near 0 than near 0.05) + # Under p-hacking, distribution is uniform or left-skewed + + # Simple test: proportion below 0.025 should be > 0.5 if real effect + below_midpoint = sum(1 for p in sig_ps if p < 0.025) + prop_below = below_midpoint / len(sig_ps) + + # KS test against uniform on [0, 0.05] + # Normalise to [0, 1] + normalised = sorted([p / 0.05 for p in sig_ps]) + n = len(normalised) + ks_stat = max( + max(abs((i + 1) / n - normalised[i]) for i in range(n)), + max(abs(normalised[i] - i / n) for i in range(n)), + ) + + # KS critical value approximation at alpha=0.05: 1.36 / sqrt(n) + ks_critical = 1.36 / math.sqrt(n) + 
uniform_rejected = ks_stat > ks_critical + + # Score: right-skewed = good (real effect), uniform/left-skewed = suspicious + if prop_below > 0.6: + score = 1.0 + elif prop_below > 0.4: + score = 0.7 if not uniform_rejected else 0.5 + else: + score = 0.3 + + return { + "score": round(score, 4), + "applicable": True, + "significant_p_count": len(sig_ps), + "total_p_count": len(p_values), + "proportion_below_025": round(prop_below, 4), + "ks_statistic": round(ks_stat, 4), + "ks_critical_005": round(ks_critical, 4), + "uniform_rejected": uniform_rejected, + "warnings": ["P-curve suggests possible p-hacking"] if score < 0.5 else [], + } + + +# ------------------------------------------------------------------ +# Utility functions +# ------------------------------------------------------------------ + + +def _sprite_check( + target_mean: float, + target_sd: float, + n: int, + scale_min: int, + scale_max: int, + max_iter: int = 5000, +) -> bool: + """Simulated annealing SPRITE check.""" + if n <= 0: + return False + + rng = random.Random(42) + + # Initialise dataset + data = [rng.randint(scale_min, scale_max) for _ in range(n)] + + # Compute target sum + target_sum = target_mean * n + target_var = target_sd ** 2 + + for _ in range(max_iter): + current_mean = sum(data) / n + current_var = sum((x - current_mean) ** 2 for x in data) / max(n - 1, 1) if n > 1 else 0 + current_sd = math.sqrt(current_var) if current_var > 0 else 0 + + mean_ok = abs(current_mean - target_mean) < 0.005 + sd_ok = abs(current_sd - target_sd) < 0.05 + + if mean_ok and sd_ok: + return True + + # Adjust a random element + idx = rng.randint(0, n - 1) + old_val = data[idx] + if current_mean < target_mean: + new_val = min(old_val + 1, scale_max) + elif current_mean > target_mean: + new_val = max(old_val - 1, scale_min) + else: + new_val = rng.randint(scale_min, scale_max) + data[idx] = new_val + + return False + + +def _chi2_survival(x: float, df: int) -> float: + """Approximate chi-squared survival 
function P(X > x). + + Uses the Wilson-Hilferty normal approximation. + """ + if x <= 0: + return 1.0 + if df <= 0: + return 0.0 + + z = ((x / df) ** (1 / 3) - (1 - 2 / (9 * df))) / math.sqrt(2 / (9 * df)) + + # Standard normal CDF approximation + t = 1.0 / (1.0 + 0.2316419 * abs(z)) + poly = t * (0.319381530 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + 1.330274429 * t)))) + pdf = math.exp(-z * z / 2) / math.sqrt(2 * math.pi) + cdf = 1.0 - pdf * poly if z > 0 else pdf * poly + + return max(0.0, min(1.0, 1.0 - cdf)) + + +async def _noop_result(name: str, note: str) -> dict: + """Return a neutral result for non-applicable tests.""" + return {"score": 0.5, "applicable": False, "note": note} diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 97d620b..e9b95ea 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -48,6 +48,8 @@ services: LOG_LEVEL: INFO LOG_FORMAT: json CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost} + volumes: + - /var/run/docker.sock:/var/run/docker.sock depends_on: postgres: condition: service_healthy diff --git a/docker-compose.yml b/docker-compose.yml index e9620a8..e0df999 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -62,6 +62,8 @@ services: CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost:3000,http://localhost:5173,http://localhost} PI_UPDATE_INTERVAL_HOURS: ${PI_UPDATE_INTERVAL_HOURS:-12} DISABLE_SCHEDULER: ${DISABLE_SCHEDULER:-false} + volumes: + - /var/run/docker.sock:/var/run/docker.sock depends_on: postgres: condition: service_healthy diff --git a/tests/test_verification/test_chemistry_adapter.py b/tests/test_verification/test_chemistry_adapter.py new file mode 100644 index 0000000..5e90711 --- /dev/null +++ b/tests/test_verification/test_chemistry_adapter.py @@ -0,0 +1,173 @@ +"""Tests for chemistry domain adapter.""" +import pytest +from unittest.mock import patch, AsyncMock + +from backend.verification.chemistry_adapter import ChemistryAdapter, RDKIT_AVAILABLE + + 
+@pytest.fixture +def adapter(): + return ChemistryAdapter() + + +class TestBasic: + def test_domain(self, adapter): + assert adapter.domain == "chemistry" + + +@pytest.mark.asyncio +class TestReactionMechanism: + async def test_unknown_claim_type(self, adapter): + result = await adapter.verify({"claim_type": "unknown"}, {}) + assert result.passed is False + assert "Unknown claim_type" in result.errors[0] + + async def test_no_reactants_or_products(self, adapter): + result = await adapter.verify({"claim_type": "reaction_mechanism"}, {}) + assert result.passed is False + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + async def test_valid_reaction(self, adapter): + result = await adapter.verify({ + "claim_type": "reaction_mechanism", + "smiles": "CC.O>>CCO", + }, {}) + assert result.score > 0.0 + assert result.details["claim_type"] == "reaction_mechanism" + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + async def test_invalid_smiles(self, adapter): + result = await adapter.verify({ + "claim_type": "reaction_mechanism", + "reactants": ["INVALID_SMILES"], + "products": ["ALSO_INVALID"], + }, {}) + assert result.details["smiles_validity"]["valid"] == 0 + + +class TestSMILESValidity: + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_valid_smiles(self): + result = ChemistryAdapter._check_smiles_validity(["CCO", "CC(=O)O", "c1ccccc1"]) + assert result["score"] == 1.0 + assert result["valid"] == 3 + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_invalid_smiles(self): + result = ChemistryAdapter._check_smiles_validity(["INVALID", "ALSO_BAD"]) + assert result["score"] == 0.0 + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_mixed_smiles(self): + result = ChemistryAdapter._check_smiles_validity(["CCO", "INVALID"]) + assert result["score"] == 0.5 + + def test_without_rdkit(self): + if not RDKIT_AVAILABLE: + result = 
ChemistryAdapter._check_smiles_validity(["CCO"]) + assert result["score"] == 0.5 + + +class TestStoichiometry: + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_balanced(self): + # Simple: ethanol formation C2H6 + O -> C2H5OH (simplified, won't balance perfectly) + result = ChemistryAdapter._check_stoichiometry(["CCO"], ["CCO"]) + assert result["score"] == 1.0 + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_invalid_smiles_returns_zero(self): + result = ChemistryAdapter._check_stoichiometry(["INVALID"], ["INVALID"]) + assert result["score"] == 0.0 + + +@pytest.mark.asyncio +class TestMolecularProperty: + async def test_no_smiles(self, adapter): + result = await adapter.verify({ + "claim_type": "molecular_property", + }, {}) + assert result.passed is False + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + @patch("backend.verification.chemistry_adapter.ChemistryAdapter._check_pubchem") + @patch("backend.verification.chemistry_adapter.ChemistryAdapter._check_chembl") + async def test_valid_molecule(self, mock_chembl, mock_pubchem, adapter): + mock_pubchem.return_value = {"score": 0.8, "found": True} + mock_chembl.return_value = {"score": 0.7, "found": True} + + result = await adapter.verify({ + "claim_type": "molecular_property", + "smiles": "CCO", + "claimed_properties": {"molecular_weight": 46.07}, + }, {}) + assert result.score > 0.0 + assert result.details["claim_type"] == "molecular_property" + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + async def test_invalid_smiles(self, adapter): + result = await adapter.verify({ + "claim_type": "molecular_property", + "smiles": "INVALID_SMILES_STRING", + }, {}) + assert result.passed is False + + +@pytest.mark.asyncio +class TestRetrosynthesis: + async def test_no_precursors(self, adapter): + result = await adapter.verify({ + "claim_type": "retrosynthesis", + "products": ["CCO"], + }, {}) + assert 
result.passed is False + + async def test_no_products(self, adapter): + result = await adapter.verify({ + "claim_type": "retrosynthesis", + "precursors": ["CC", "O"], + }, {}) + assert result.passed is False + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + async def test_valid_retrosynthesis(self, adapter): + result = await adapter.verify({ + "claim_type": "retrosynthesis", + "precursors": ["CC=O", "CC"], + "products": ["CC(O)CC"], + }, {}) + assert result.score > 0.0 + assert result.details["claim_type"] == "retrosynthesis" + + +class TestPropertyRanges: + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_matching_molecular_weight(self): + result = ChemistryAdapter._check_property_ranges( + "CCO", + {"molecular_weight": 46.07}, + ) + assert result["score"] > 0.0 + assert "computed_properties" in result + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_invalid_smiles(self): + result = ChemistryAdapter._check_property_ranges( + "INVALID", + {"molecular_weight": 100.0}, + ) + assert result["score"] == 0.0 + + +class TestAtomConservation: + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_conserved(self): + result = ChemistryAdapter._check_atom_conservation(["CC", "O"], ["CCO"]) + assert result["conserved"] is True + assert result["score"] == 1.0 + + @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="rdkit not installed") + def test_atoms_lost(self): + # Product has more atoms than precursors + result = ChemistryAdapter._check_atom_conservation(["C"], ["CCCCCC"]) + assert result["score"] < 1.0 diff --git a/tests/test_verification/test_citation_verifier.py b/tests/test_verification/test_citation_verifier.py new file mode 100644 index 0000000..2ddbfac --- /dev/null +++ b/tests/test_verification/test_citation_verifier.py @@ -0,0 +1,145 @@ +"""Tests for citation & reference verifier.""" +import pytest +from unittest.mock import AsyncMock, patch, 
MagicMock + +from backend.verification.citation_verifier import CitationVerifier, _jaccard_similarity + + +@pytest.fixture +def verifier(): + return CitationVerifier() + + +class TestApplicability: + def test_applicable_with_citations(self, verifier): + assert verifier.is_applicable({"citations": [{"title": "Test"}]}, {}) is True + + def test_applicable_with_references(self, verifier): + assert verifier.is_applicable({"references": [{"title": "Test"}]}, {}) is True + + def test_applicable_with_papers(self, verifier): + assert verifier.is_applicable({"papers": [{"title": "Test"}]}, {}) is True + + def test_applicable_with_bibliography(self, verifier): + assert verifier.is_applicable({"bibliography": [{"title": "Test"}]}, {}) is True + + def test_not_applicable_empty(self, verifier): + assert verifier.is_applicable({}, {}) is False + + def test_not_applicable_empty_list(self, verifier): + assert verifier.is_applicable({"citations": []}, {}) is False + + +class TestJaccardSimilarity: + def test_identical(self): + assert _jaccard_similarity("hello world", "hello world") == 1.0 + + def test_no_overlap(self): + assert _jaccard_similarity("hello world", "foo bar") == 0.0 + + def test_partial_overlap(self): + sim = _jaccard_similarity("the quick brown fox", "the lazy brown dog") + assert 0.0 < sim < 1.0 + + def test_empty_strings(self): + assert _jaccard_similarity("", "") == 0.0 + assert _jaccard_similarity("hello", "") == 0.0 + + +class TestExtractCitations: + def test_extract_from_list_of_dicts(self, verifier): + result = {"citations": [{"title": "Paper A", "doi": "10.1234/test"}]} + citations = verifier._extract_citations(result) + assert len(citations) == 1 + assert citations[0]["title"] == "Paper A" + + def test_extract_from_list_of_strings(self, verifier): + result = {"references": ["Paper A", "Paper B"]} + citations = verifier._extract_citations(result) + assert len(citations) == 2 + assert citations[0]["title"] == "Paper A" + + def test_extract_empty(self, 
verifier): + assert verifier._extract_citations({}) == [] + assert verifier._extract_citations({"citations": []}) == [] + + +class TestExtractDOI: + def test_extract_from_url(self): + doi = CitationVerifier._extract_doi_from_url("https://doi.org/10.1038/s41586-023-06474-x") + assert doi == "10.1038/s41586-023-06474-x" + + def test_no_doi_in_url(self): + doi = CitationVerifier._extract_doi_from_url("https://example.com/paper") + assert doi == "" + + +class TestFreshness: + def test_recent_paper_high_score(self): + citation = {"year": 2025} + score = CitationVerifier._check_freshness(citation, "ml_ai") + assert score == 1.0 + + def test_old_paper_fast_domain(self): + citation = {"year": 2010} + score = CitationVerifier._check_freshness(citation, "ml_ai") + assert score < 1.0 + + def test_old_paper_slow_domain(self): + citation = {"year": 2015} + score = CitationVerifier._check_freshness(citation, "mathematics") + assert score == 1.0 + + def test_no_year_neutral(self): + score = CitationVerifier._check_freshness({}, "ml_ai") + assert score == 0.5 + + +class TestClaimSupport: + def test_matching_text(self): + citation = {"claim_text": "deep learning improves accuracy on benchmarks"} + abstract = "deep learning methods improve accuracy on standard benchmarks significantly" + score = CitationVerifier._check_claim_support(citation, abstract) + assert score > 0.3 + + def test_no_claim_text(self): + score = CitationVerifier._check_claim_support({}, "some abstract") + assert score == 0.5 + + def test_no_abstract(self): + score = CitationVerifier._check_claim_support({"claim_text": "test"}, "") + assert score == 0.5 + + +@pytest.mark.asyncio +class TestVerify: + async def test_no_citations_returns_zero(self, verifier): + result = await verifier.verify({"no_citations": True}, {}) + assert result.score == 0.0 + assert len(result.errors) > 0 + + @patch("backend.verification.citation_verifier.CitationVerifier._resolve_doi") + 
@patch("backend.verification.citation_verifier.CitationVerifier._query_openalex") + @patch("backend.verification.citation_verifier.CitationVerifier._query_semantic_scholar") + async def test_single_citation_with_mocked_apis( + self, mock_ss, mock_oa, mock_doi, verifier, + ): + mock_doi.return_value = {"score": 1.0, "resolved": True, "title": "Test", "doi": "10.1234/test"} + mock_oa.return_value = {"score": 0.9, "source": "openalex", "matched_title": "Test", "similarity": 0.9, "abstract": "Test abstract", "year": 2024} + mock_ss.return_value = {"score": 0.5, "source": "semantic_scholar"} + + task_result = { + "citations": [{"title": "Test Paper", "doi": "10.1234/test", "year": 2024}], + } + result = await verifier.verify(task_result, {"domain": "general"}) + assert result.score > 0.0 + assert result.verifier_name == "citation_reference" + + async def test_caps_at_max_citations(self, verifier): + # 15 citations should be capped to 10 + task_result = { + "citations": [{"title": f"Paper {i}"} for i in range(15)], + } + # We just verify it doesn't crash — actual API calls will fail gracefully + result = await verifier.verify(task_result, {"domain": "general"}) + assert result.details.get("citations_checked", 0) <= 10 diff --git a/tests/test_verification/test_cross_cutting_base.py b/tests/test_verification/test_cross_cutting_base.py new file mode 100644 index 0000000..033a9f5 --- /dev/null +++ b/tests/test_verification/test_cross_cutting_base.py @@ -0,0 +1,88 @@ +"""Tests for cross-cutting verifier base classes.""" +import pytest + +from backend.verification.cross_cutting_base import ( + CrossCuttingResult, + CrossCuttingVerifier, +) + + +class TestCrossCuttingResult: + def test_defaults(self): + r = CrossCuttingResult( + verifier_name="test", + score=0.8, + weight=0.10, + ) + assert r.verifier_name == "test" + assert r.score == 0.8 + assert r.weight == 0.10 + assert r.details == {} + assert r.errors == [] + assert r.warnings == [] + assert r.compute_time_seconds == 0.0 
+ + def test_with_details(self): + r = CrossCuttingResult( + verifier_name="citation", + score=0.6, + weight=0.15, + details={"checked": 5}, + errors=["DOI failed"], + warnings=["Old reference"], + compute_time_seconds=1.5, + ) + assert r.details == {"checked": 5} + assert len(r.errors) == 1 + assert len(r.warnings) == 1 + assert r.compute_time_seconds == 1.5 + + def test_score_bounds(self): + r = CrossCuttingResult(verifier_name="t", score=0.0, weight=0.1) + assert r.score == 0.0 + + r2 = CrossCuttingResult(verifier_name="t", score=1.0, weight=0.1) + assert r2.score == 1.0 + + +class TestCrossCuttingVerifier: + def test_default_attributes(self): + v = CrossCuttingVerifier() + assert v.name == "" + assert v.default_weight == 0.10 + assert v.requires_docker is False + + def test_is_applicable_not_implemented(self): + v = CrossCuttingVerifier() + with pytest.raises(NotImplementedError): + v.is_applicable({}, {}) + + def test_verify_not_implemented(self): + v = CrossCuttingVerifier() + with pytest.raises(NotImplementedError): + # Can't await in sync context, just test it raises + import asyncio + asyncio.get_event_loop().run_until_complete(v.verify({}, {})) + + def test_subclass(self): + class MyVerifier(CrossCuttingVerifier): + name = "my_verifier" + default_weight = 0.20 + requires_docker = True + + def is_applicable(self, task_result, task_metadata): + return "data" in task_result + + async def verify(self, task_result, task_metadata): + return CrossCuttingResult( + verifier_name=self.name, + score=1.0, + weight=self.default_weight, + ) + + v = MyVerifier() + assert v.name == "my_verifier" + assert v.default_weight == 0.20 + assert v.requires_docker is True + assert v.is_applicable({"data": [1]}, {}) is True + assert v.is_applicable({"other": 1}, {}) is False diff --git a/tests/test_verification/test_cross_cutting_runner.py b/tests/test_verification/test_cross_cutting_runner.py new file mode 100644 index 0000000..ca0c364 --- /dev/null +++ 
b/tests/test_verification/test_cross_cutting_runner.py @@ -0,0 +1,214 @@ +"""Tests for cross-cutting runner: registration, filtering, merge math, exception handling.""" +import pytest + +from backend.verification.base import VerificationBadge, VerificationResult +from backend.verification.cross_cutting_base import ( + CrossCuttingResult, + CrossCuttingVerifier, +) +from backend.verification.cross_cutting_runner import ( + _CC_VERIFIERS, + merge_results, + run_cross_cutting, + register_cross_cutting, + get_cross_cutting_verifiers, +) + + +class AlwaysApplicableVerifier(CrossCuttingVerifier): + name = "always" + default_weight = 0.10 + + def is_applicable(self, task_result, task_metadata): + return True + + async def verify(self, task_result, task_metadata): + return CrossCuttingResult( + verifier_name=self.name, + score=0.8, + weight=self.default_weight, + details={"check": "passed"}, + ) + + +class NeverApplicableVerifier(CrossCuttingVerifier): + name = "never" + default_weight = 0.10 + + def is_applicable(self, task_result, task_metadata): + return False + + async def verify(self, task_result, task_metadata): + return CrossCuttingResult(verifier_name=self.name, score=1.0, weight=self.default_weight) + + +class CrashingVerifier(CrossCuttingVerifier): + name = "crasher" + default_weight = 0.10 + + def is_applicable(self, task_result, task_metadata): + return True + + async def verify(self, task_result, task_metadata): + raise RuntimeError("Verifier exploded") + + +class CrashingApplicabilityVerifier(CrossCuttingVerifier): + name = "bad_applicability" + default_weight = 0.10 + + def is_applicable(self, task_result, task_metadata): + raise ValueError("Oops") + + async def verify(self, task_result, task_metadata): + return CrossCuttingResult(verifier_name=self.name, score=1.0, weight=self.default_weight) + + +class TestRegistration: + def test_builtin_verifiers_registered(self): + """All 4 cross-cutting verifiers should be registered at import time.""" + names = 
{v.name for v in _CC_VERIFIERS} + assert "citation_reference" in names + assert "statistical_forensics" in names + assert "reproducibility" in names + assert "data_integrity" in names + + def test_get_cross_cutting_verifiers_returns_copy(self): + verifiers = get_cross_cutting_verifiers() + assert len(verifiers) >= 4 + # Modifying the returned list shouldn't affect the registry + original_len = len(_CC_VERIFIERS) + verifiers.append(AlwaysApplicableVerifier()) + assert len(_CC_VERIFIERS) == original_len + + +class TestMergeResults: + def test_basic_merge(self): + domain = VerificationResult( + passed=True, score=0.8, + badge=VerificationBadge.GREEN, + domain="test", + details={"domain_detail": True}, + compute_time_seconds=1.0, + ) + cc = [ + CrossCuttingResult( + verifier_name="v1", score=0.6, weight=0.15, + details={"v1": True}, compute_time_seconds=0.5, + ), + ] + merged = merge_results(domain, cc) + + # 0.70 * 0.8 + 0.30 * 0.6 = 0.56 + 0.18 = 0.74 + assert merged.score == 0.74 + assert merged.passed is True + assert "cross_cutting" in merged.details + assert "scoring" in merged.details + assert merged.details["scoring"]["domain_score"] == 0.8 + assert merged.compute_time_seconds == 1.5 + + def test_merge_with_multiple_cc(self): + domain = VerificationResult( + passed=True, score=0.8, + badge=VerificationBadge.GREEN, + domain="test", + ) + cc = [ + CrossCuttingResult(verifier_name="v1", score=0.6, weight=0.15), + CrossCuttingResult(verifier_name="v2", score=1.0, weight=0.10), + ] + merged = merge_results(domain, cc) + + # CC scores weighted: (0.15/0.25 * 0.6 + 0.10/0.25 * 1.0) = 0.36 + 0.4 = 0.76 + # Final: 0.70 * 0.8 + 0.30 * 0.76 = 0.56 + 0.228 = 0.788 + assert abs(merged.score - 0.788) < 0.001 + + def test_merge_empty_cc_returns_domain(self): + domain = VerificationResult( + passed=True, score=0.9, + badge=VerificationBadge.GREEN, + domain="test", + ) + merged = merge_results(domain, []) + assert merged.score == 0.9 + + def 
test_merge_preserves_errors_warnings(self): + domain = VerificationResult( + passed=True, score=0.8, + badge=VerificationBadge.GREEN, + domain="test", + warnings=["domain warning"], + errors=["domain error"], + ) + cc = [ + CrossCuttingResult( + verifier_name="v1", score=0.5, weight=0.1, + warnings=["cc warning"], + errors=["cc error"], + ), + ] + merged = merge_results(domain, cc) + assert "domain warning" in merged.warnings + assert "cc warning" in merged.warnings + assert "domain error" in merged.errors + assert "cc error" in merged.errors + + def test_merge_badge_recalculated(self): + # Domain passes green (0.9) but CC drags it down + domain = VerificationResult( + passed=True, score=0.9, + badge=VerificationBadge.GREEN, + domain="test", + ) + cc = [ + CrossCuttingResult(verifier_name="v1", score=0.0, weight=0.10), + ] + merged = merge_results(domain, cc) + # 0.70 * 0.9 + 0.30 * 0.0 = 0.63 + assert merged.score == 0.63 + assert merged.badge == VerificationBadge.AMBER + + def test_custom_domain_weight(self): + domain = VerificationResult( + passed=True, score=1.0, + badge=VerificationBadge.GREEN, + domain="test", + ) + cc = [ + CrossCuttingResult(verifier_name="v1", score=0.0, weight=0.10), + ] + merged = merge_results(domain, cc, domain_weight=0.90) + # 0.90 * 1.0 + 0.10 * 0.0 = 0.90 + assert merged.score == 0.9 + + +@pytest.mark.asyncio +class TestRunCrossCutting: + async def test_no_applicable_returns_empty(self): + # With actual verifiers but task_result has no relevant keys + results = await run_cross_cutting({}, {}) + assert results == [] + + async def test_crashing_verifier_returns_zero_score(self): + # Temporarily add a crashing verifier + crasher = CrashingVerifier() + _CC_VERIFIERS.append(crasher) + try: + results = await run_cross_cutting({"data": [1]}, {}) + # The crasher should produce score 0.0 + crasher_results = [r for r in results if r.verifier_name == "crasher"] + if crasher_results: + assert crasher_results[0].score == 0.0 + assert 
len(crasher_results[0].errors) > 0 + finally: + _CC_VERIFIERS.remove(crasher) + + async def test_crashing_applicability_is_filtered(self): + bad = CrashingApplicabilityVerifier() + _CC_VERIFIERS.append(bad) + try: + # Should not crash, just skip the bad verifier + results = await run_cross_cutting({"data": [1]}, {}) + assert all(r.verifier_name != "bad_applicability" for r in results) + finally: + _CC_VERIFIERS.remove(bad) diff --git a/tests/test_verification/test_data_integrity.py b/tests/test_verification/test_data_integrity.py new file mode 100644 index 0000000..bd6a36e --- /dev/null +++ b/tests/test_verification/test_data_integrity.py @@ -0,0 +1,183 @@ +"""Tests for data integrity cross-cutting verifier.""" +import pytest +import hashlib +import json + +from backend.verification.data_integrity import DataIntegrityVerifier + + +@pytest.fixture +def verifier(): + return DataIntegrityVerifier() + + +class TestApplicability: + def test_applicable_with_data(self, verifier): + assert verifier.is_applicable({"data": [{"a": 1}]}, {}) is True + + def test_applicable_with_dataset(self, verifier): + assert verifier.is_applicable({"dataset": [{"a": 1}]}, {}) is True + + def test_applicable_with_raw_data(self, verifier): + assert verifier.is_applicable({"raw_data": [{"a": 1}]}, {}) is True + + def test_applicable_with_results_summary(self, verifier): + assert verifier.is_applicable({"results_summary": {"mean": 3.5}}, {}) is True + + def test_applicable_with_checksums(self, verifier): + assert verifier.is_applicable({"output_checksums": {"file.csv": "abc123"}}, {}) is True + + def test_not_applicable_empty(self, verifier): + assert verifier.is_applicable({}, {}) is False + + +class TestExtractData: + def test_extract_from_list(self): + data = DataIntegrityVerifier._extract_data({"data": [{"a": 1}, {"a": 2}]}) + assert len(data) == 2 + + def test_extract_from_dict_with_rows(self): + data = DataIntegrityVerifier._extract_data({"data": {"rows": [{"a": 1}]}}) + assert len(data) 
== 1 + + def test_extract_from_results_summary(self): + data = DataIntegrityVerifier._extract_data({"results_summary": {"mean": 3.5, "std": 1.2}}) + assert data is not None + assert len(data) == 1 + + def test_extract_returns_none_for_empty(self): + data = DataIntegrityVerifier._extract_data({}) + assert data is None + + +class TestSchemaCheck: + def test_consistent_schema(self): + data = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] + result = DataIntegrityVerifier._check_schema(data, None) + assert result["score"] == 1.0 + + def test_inconsistent_schema(self): + data = [{"a": 1, "b": 2}, {"a": 3, "c": 4}, {"a": 5, "b": 6}] + result = DataIntegrityVerifier._check_schema(data, None) + assert result["score"] < 1.0 + assert result["inconsistent_rows"] > 0 + + def test_explicit_schema_match(self): + data = [{"a": 1, "b": 2}] + schema = {"fields": ["a", "b"]} + result = DataIntegrityVerifier._check_schema(data, schema) + assert result["score"] == 1.0 + + def test_explicit_schema_missing_fields(self): + data = [{"a": 1}] + schema = {"fields": ["a", "b", "c"]} + result = DataIntegrityVerifier._check_schema(data, schema) + assert result["score"] < 1.0 + assert "b" in result["missing_fields"] + + def test_single_row(self): + data = [{"a": 1}] + result = DataIntegrityVerifier._check_schema(data, None) + assert result["score"] == 1.0 + + def test_empty_data(self): + result = DataIntegrityVerifier._check_schema([], None) + assert result["applicable"] is False + + +class TestDuplicateCheck: + def test_no_duplicates(self): + data = [{"a": 1}, {"a": 2}, {"a": 3}] + result = DataIntegrityVerifier._check_duplicates(data) + assert result["score"] == 1.0 + assert result["exact_duplicates"] == 0 + + def test_some_duplicates(self): + data = [{"a": 1}, {"a": 1}, {"a": 2}, {"a": 3}] + result = DataIntegrityVerifier._check_duplicates(data) + assert result["exact_duplicates"] == 1 + assert result["score"] < 1.0 + + def test_all_duplicates(self): + data = [{"a": 1}] * 10 + result 
= DataIntegrityVerifier._check_duplicates(data) + assert result["score"] < 0.5 + assert result["exact_duplicates"] == 9 + + def test_single_row(self): + data = [{"a": 1}] + result = DataIntegrityVerifier._check_duplicates(data) + assert result["score"] == 1.0 + + +class TestOutlierCheck: + def test_no_outliers(self): + data = [{"x": float(i)} for i in range(100)] + result = DataIntegrityVerifier._check_outliers(data) + assert result["score"] >= 0.8 + + def test_with_outliers(self): + data = [{"x": float(i)} for i in range(100)] + data.extend([{"x": 1000.0}] * 20) # Add many extreme outliers + result = DataIntegrityVerifier._check_outliers(data) + assert result["total_outliers"] > 0 + + def test_no_numeric_data(self): + data = [{"name": "Alice"}, {"name": "Bob"}] + result = DataIntegrityVerifier._check_outliers(data) + assert result["applicable"] is False + + def test_empty_data(self): + result = DataIntegrityVerifier._check_outliers([]) + assert result["applicable"] is False + + +class TestHashCheck: + def test_matching_hash(self): + data_blob = {"key": "value"} + serialised = json.dumps(data_blob, sort_keys=True, separators=(",", ":"), default=str) + expected_hash = hashlib.sha256(serialised.encode()).hexdigest() + + result = DataIntegrityVerifier._check_hashes( + {"my_data": data_blob}, + {"my_data": expected_hash}, + ) + assert result["score"] == 1.0 + + def test_mismatched_hash(self): + result = DataIntegrityVerifier._check_hashes( + {"my_data": "actual content"}, + {"my_data": "deadbeef" * 8}, + ) + assert result["score"] == 0.0 + assert result["mismatches"] == 1 + + def test_missing_data(self): + result = DataIntegrityVerifier._check_hashes( + {}, + {"missing_key": "abc123"}, + ) + assert result["score"] == 0.0 + + def test_no_checksums(self): + result = DataIntegrityVerifier._check_hashes({}, {}) + assert result["applicable"] is False + + +@pytest.mark.asyncio +class TestVerify: + async def test_with_clean_data(self, verifier): + task_result = { + "data": 
[{"x": float(i), "y": float(i * 2)} for i in range(50)], + } + result = await verifier.verify(task_result, {}) + assert result.verifier_name == "data_integrity" + assert result.score > 0.0 + + async def test_with_duplicated_data(self, verifier): + task_result = { + "data": [{"x": 1, "y": 2}] * 50, + } + result = await verifier.verify(task_result, {}) + # Should get penalized for duplicates + assert result.score < 1.0 diff --git a/tests/test_verification/test_dispatcher.py b/tests/test_verification/test_dispatcher.py index 250795e..ba3b2d0 100644 --- a/tests/test_verification/test_dispatcher.py +++ b/tests/test_verification/test_dispatcher.py @@ -47,12 +47,14 @@ def test_get_unknown_domain(self): assert get_adapter("nonexistent_domain_xyz") is None def test_builtin_adapters_registered(self): - """All 5 domain adapters should be registered at import time.""" + """All 7 domain adapters should be registered at import time.""" assert get_adapter("mathematics") is not None assert get_adapter("ml_ai") is not None assert get_adapter("computational_biology") is not None assert get_adapter("materials_science") is not None assert get_adapter("bioinformatics") is not None + assert get_adapter("chemistry") is not None + assert get_adapter("physics") is not None @pytest.mark.asyncio diff --git a/tests/test_verification/test_ml_live_inference.py b/tests/test_verification/test_ml_live_inference.py new file mode 100644 index 0000000..343c509 --- /dev/null +++ b/tests/test_verification/test_ml_live_inference.py @@ -0,0 +1,135 @@ +"""Tests for ML live inference claim type.""" +import json +import pytest +from unittest.mock import patch, AsyncMock + +from backend.verification.ml_repro_adapter import MLReproAdapter + + +@pytest.fixture +def adapter(): + return MLReproAdapter() + + +class TestRequiresDockerFor: + def test_benchmark_live_requires_docker(self, adapter): + assert adapter.requires_docker_for({"claim_type": "benchmark_live"}) is True + + def 
test_benchmark_result_no_docker(self, adapter): + assert adapter.requires_docker_for({"claim_type": "benchmark_result"}) is False + + def test_ml_experiment_no_docker(self, adapter): + assert adapter.requires_docker_for({"claim_type": "ml_experiment"}) is False + + def test_architecture_no_docker(self, adapter): + assert adapter.requires_docker_for({"claim_type": "architecture"}) is False + + +class TestBuildInferenceScript: + def test_script_contains_model_id(self): + script = MLReproAdapter._build_inference_script("my-model/test", "mmlu", 20) + assert "my-model/test" in script + assert "sample_size = 20" in script + + def test_script_is_valid_python(self): + script = MLReproAdapter._build_inference_script("test/model", "test", 10) + compile(script, "", "exec") + + +@pytest.mark.asyncio +class TestBenchmarkLive: + async def test_no_model_id(self, adapter): + result = await adapter.verify({"claim_type": "benchmark_live"}, {}) + assert result.passed is False + assert "model_id" in result.errors[0] + + async def test_no_benchmark(self, adapter): + result = await adapter.verify({ + "claim_type": "benchmark_live", + "model_id": "test/model", + }, {}) + assert result.passed is False + assert "benchmark" in result.errors[0] + + @patch("asyncio.create_subprocess_exec") + async def test_docker_timeout(self, mock_exec, adapter): + import asyncio + mock_proc = AsyncMock() + mock_proc.communicate.side_effect = asyncio.TimeoutError() + mock_exec.return_value = mock_proc + + result = await adapter.verify({ + "claim_type": "benchmark_live", + "model_id": "test/model", + "benchmark": "mmlu", + }, {}) + assert result.passed is False + assert "timed out" in result.errors[0] + + @patch("asyncio.create_subprocess_exec") + async def test_successful_inference(self, mock_exec, adapter): + inference_output = json.dumps({ + "model_loaded": True, + "total_samples": 20, + "successful_samples": 18, + "metrics": {"accuracy": 0.65}, + "avg_latency_seconds": 2.5, + }) + + mock_proc = 
AsyncMock() + mock_proc.communicate.return_value = ( + inference_output.encode(), + b"", + ) + mock_proc.returncode = 0 + mock_exec.return_value = mock_proc + + result = await adapter.verify({ + "claim_type": "benchmark_live", + "model_id": "test/model", + "benchmark": "mmlu", + "metrics": {"accuracy": 0.65}, + }, {}) + assert result.score > 0.0 + assert result.details["claim_type"] == "benchmark_live" + + @patch("asyncio.create_subprocess_exec") + async def test_model_load_failure(self, mock_exec, adapter): + inference_output = json.dumps({ + "model_loaded": False, + "error": "Model not found", + }) + + mock_proc = AsyncMock() + mock_proc.communicate.return_value = ( + inference_output.encode(), + b"", + ) + mock_proc.returncode = 1 + mock_exec.return_value = mock_proc + + result = await adapter.verify({ + "claim_type": "benchmark_live", + "model_id": "nonexistent/model", + "benchmark": "mmlu", + }, {}) + assert result.passed is False + assert result.score == 0.0 + + @patch("asyncio.create_subprocess_exec") + async def test_invalid_json_output(self, mock_exec, adapter): + mock_proc = AsyncMock() + mock_proc.communicate.return_value = ( + b"not valid json", + b"Some error occurred", + ) + mock_proc.returncode = 1 + mock_exec.return_value = mock_proc + + result = await adapter.verify({ + "claim_type": "benchmark_live", + "model_id": "test/model", + "benchmark": "mmlu", + }, {}) + assert result.passed is False + assert "valid JSON" in result.errors[0] diff --git a/tests/test_verification/test_payloads.py b/tests/test_verification/test_payloads.py index a2dd735..5a0e1c0 100644 --- a/tests/test_verification/test_payloads.py +++ b/tests/test_verification/test_payloads.py @@ -5,6 +5,7 @@ from backend.payloads.task_payloads import ( AnalysisResult, BioinformaticsPayload, + ChemistryPayload, CompBioPayload, CritiqueResult, DeepResearchResult, @@ -12,6 +13,7 @@ MaterialsSciencePayload, MathematicsPayload, MLAIPayload, + PhysicsPayload, SynthesisResult, validate_task_result, 
) @@ -97,6 +99,7 @@ def test_valid_theorem(self): } model = MathematicsPayload.model_validate(data) assert model.claim_type == "theorem" + assert model.proof_system == "lean4" # default def test_valid_conjecture(self): data = { @@ -106,6 +109,31 @@ def test_valid_conjecture(self): model = MathematicsPayload.model_validate(data) assert model.claim_type == "conjecture" + def test_valid_coq(self): + data = { + "claim_type": "theorem", + "proof_system": "coq", + "proof_code": "Theorem test : True. Proof. trivial. Qed.", + } + model = MathematicsPayload.model_validate(data) + assert model.proof_system == "coq" + + def test_valid_isabelle(self): + data = { + "claim_type": "theorem", + "proof_system": "isabelle", + "proof_code": "lemma test: True by simp", + "theory_name": "MyTheory", + } + model = MathematicsPayload.model_validate(data) + assert model.proof_system == "isabelle" + assert model.theory_name == "MyTheory" + + def test_invalid_proof_system(self): + data = {"claim_type": "theorem", "proof_system": "agda", "proof_code": "A" * 10} + with pytest.raises(Exception): + MathematicsPayload.model_validate(data) + def test_invalid_claim_type(self): data = {"claim_type": "lemma", "proof_code": "A" * 10} with pytest.raises(Exception): @@ -122,6 +150,26 @@ def test_valid_benchmark(self): } model = MLAIPayload.model_validate(data) assert model.model_id == "meta-llama/Llama-3-8B" + assert model.sample_size == 20 # default + + def test_valid_benchmark_live(self): + data = { + "claim_type": "benchmark_live", + "model_id": "test/model", + "benchmark": "mmlu", + "sample_size": 30, + } + model = MLAIPayload.model_validate(data) + assert model.claim_type == "benchmark_live" + assert model.sample_size == 30 + + def test_sample_size_bounds(self): + # Too small + with pytest.raises(Exception): + MLAIPayload.model_validate({"claim_type": "benchmark_live", "sample_size": 2}) + # Too large + with pytest.raises(Exception): + MLAIPayload.model_validate({"claim_type": "benchmark_live", 
"sample_size": 100}) def test_valid_architecture(self): data = { @@ -211,3 +259,70 @@ def test_domain_fields_not_present_skips_validation(self): valid, errors = validate_task_result("analysis", "mathematics", result) # No domain fields present, so domain validation is skipped assert valid is True + + +class TestChemistryPayload: + def test_valid_reaction(self): + data = { + "claim_type": "reaction_mechanism", + "smiles": "CC.O>>CCO", + } + model = ChemistryPayload.model_validate(data) + assert model.claim_type == "reaction_mechanism" + + def test_valid_molecular_property(self): + data = { + "claim_type": "molecular_property", + "smiles": "CCO", + "claimed_properties": {"molecular_weight": 46.07}, + } + model = ChemistryPayload.model_validate(data) + assert model.smiles == "CCO" + + def test_valid_retrosynthesis(self): + data = { + "claim_type": "retrosynthesis", + "precursors": ["CC=O", "CC"], + "products": ["CC(O)CC"], + } + model = ChemistryPayload.model_validate(data) + assert len(model.precursors) == 2 + + def test_invalid_claim_type(self): + data = {"claim_type": "alchemy"} + with pytest.raises(Exception): + ChemistryPayload.model_validate(data) + + +class TestPhysicsPayload: + def test_valid_simulation(self): + data = { + "claim_type": "numerical_simulation", + "conservation_quantities": {"energy": {"initial": 100, "final": 100}}, + } + model = PhysicsPayload.model_validate(data) + assert model.claim_type == "numerical_simulation" + + def test_valid_derivation(self): + data = { + "claim_type": "analytical_derivation", + "expression": "E = m * c**2", + "units": {"E": "joule", "m": "kilogram", "c": "meter/second"}, + } + model = PhysicsPayload.model_validate(data) + assert model.expression == "E = m * c**2" + + def test_valid_dimensional_analysis(self): + data = { + "claim_type": "dimensional_analysis", + "lhs": "F", + "rhs": "m * a", + "units": {"F": "newton", "m": "kilogram", "a": "meter/second**2"}, + } + model = PhysicsPayload.model_validate(data) + assert 
model.lhs == "F" + + def test_invalid_claim_type(self): + data = {"claim_type": "string_theory"} + with pytest.raises(Exception): + PhysicsPayload.model_validate(data) diff --git a/tests/test_verification/test_physics_adapter.py b/tests/test_verification/test_physics_adapter.py new file mode 100644 index 0000000..698e455 --- /dev/null +++ b/tests/test_verification/test_physics_adapter.py @@ -0,0 +1,266 @@ +"""Tests for physics domain adapter.""" +import pytest + +from backend.verification.physics_adapter import PhysicsAdapter, PINT_AVAILABLE, SYMPY_AVAILABLE + + +@pytest.fixture +def adapter(): + return PhysicsAdapter() + + +class TestBasic: + def test_domain(self, adapter): + assert adapter.domain == "physics" + + +@pytest.mark.asyncio +class TestNumericalSimulation: + async def test_unknown_claim_type(self, adapter): + result = await adapter.verify({"claim_type": "unknown"}, {}) + assert result.passed is False + + async def test_no_data(self, adapter): + result = await adapter.verify({ + "claim_type": "numerical_simulation", + }, {}) + assert result.passed is False + + async def test_conservation_laws_pass(self, adapter): + result = await adapter.verify({ + "claim_type": "numerical_simulation", + "conservation_quantities": { + "energy": {"initial": 100.0, "final": 100.0}, + "momentum": {"initial": 50.0, "final": 50.0}, + }, + "simulation_data": {}, + }, {}) + assert result.score > 0.0 + assert result.details["conservation"]["conserved"] == 2 + + async def test_conservation_laws_violated(self, adapter): + result = await adapter.verify({ + "claim_type": "numerical_simulation", + "conservation_quantities": { + "energy": {"initial": 100.0, "final": 50.0}, + }, + "simulation_data": {}, + }, {}) + assert result.details["conservation"]["conserved"] == 0 + + +class TestConservation: + def test_conserved(self): + result = PhysicsAdapter._check_conservation( + {"energy": {"initial": 100.0, "final": 100.0, "tolerance": 0.1}}, + {}, + ) + assert result["score"] == 1.0 + + 
def test_violated(self): + result = PhysicsAdapter._check_conservation( + {"energy": {"initial": 100.0, "final": 50.0}}, + {}, + ) + assert result["score"] == 0.0 + + def test_empty(self): + result = PhysicsAdapter._check_conservation({}, {}) + assert result["applicable"] is False + + +class TestStability: + def test_stable_series(self): + result = PhysicsAdapter._check_stability({ + "time_series": {"temp": [100.0, 100.1, 99.9, 100.0, 100.2] * 10}, + }) + assert result["score"] == 1.0 + + def test_nan_in_series(self): + result = PhysicsAdapter._check_stability({ + "time_series": {"temp": [1.0, 2.0, float("nan"), 3.0]}, + }) + assert result["score"] < 1.0 + assert any("NaN" in issue for issue in result["issues"]) + + def test_exponential_growth(self): + # Create exponentially growing series + values = [float(2 ** i) for i in range(40)] + result = PhysicsAdapter._check_stability({ + "time_series": {"diverging": values}, + }) + assert result["score"] < 1.0 + + def test_no_time_series(self): + result = PhysicsAdapter._check_stability({}) + assert result["applicable"] is False + + +class TestConvergence: + def test_monotonically_decreasing(self): + result = PhysicsAdapter._check_convergence({ + "mesh_refinement": [ + {"resolution": 10, "error": 1.0}, + {"resolution": 20, "error": 0.5}, + {"resolution": 40, "error": 0.25}, + ], + }) + assert result["score"] == 1.0 + assert result["monotonically_decreasing"] is True + + def test_non_monotonic(self): + result = PhysicsAdapter._check_convergence({ + "mesh_refinement": [ + {"resolution": 10, "error": 1.0}, + {"resolution": 20, "error": 1.5}, # Error increased! 
+ {"resolution": 40, "error": 0.25}, + ], + }) + assert result["score"] < 1.0 + + def test_no_convergence_data(self): + result = PhysicsAdapter._check_convergence({}) + assert result["applicable"] is False + + +class TestBoundaryConditions: + def test_matching_boundaries(self): + result = PhysicsAdapter._check_boundary_conditions({ + "boundary_conditions": {"left": 0.0, "right": 1.0}, + "boundary_results": {"left": 0.0, "right": 1.0}, + }) + assert result["score"] == 1.0 + + def test_mismatched_boundaries(self): + result = PhysicsAdapter._check_boundary_conditions({ + "boundary_conditions": {"left": 0.0, "right": 1.0}, + "boundary_results": {"left": 0.5, "right": 1.0}, + }) + assert result["score"] == 0.5 + + def test_no_boundary_data(self): + result = PhysicsAdapter._check_boundary_conditions({}) + assert result["applicable"] is False + + +@pytest.mark.asyncio +class TestAnalyticalDerivation: + async def test_no_expression(self, adapter): + result = await adapter.verify({ + "claim_type": "analytical_derivation", + }, {}) + assert result.passed is False + + @pytest.mark.skipif(not SYMPY_AVAILABLE, reason="sympy not installed") + async def test_valid_expression(self, adapter): + result = await adapter.verify({ + "claim_type": "analytical_derivation", + "expression": "x**2 + 2*x + 1", + "units": {}, + }, {}) + assert result.score > 0.0 + + @pytest.mark.skipif(not SYMPY_AVAILABLE, reason="sympy not installed") + async def test_lhs_rhs_equal(self, adapter): + result = await adapter.verify({ + "claim_type": "analytical_derivation", + "lhs": "(x+1)**2", + "rhs": "x**2 + 2*x + 1", + "units": {}, + }, {}) + assert result.score > 0.0 + sym_detail = result.details.get("symbolic_validity", {}) + assert sym_detail.get("symbolically_equal") is True + + +class TestDimensionalConsistency: + @pytest.mark.skipif(not PINT_AVAILABLE, reason="pint not installed") + def test_compatible_units(self): + result = PhysicsAdapter._check_dimensional_consistency( + "F = m * a", + {"lhs": 
"newton", "rhs": "kilogram * meter / second**2"}, + ) + assert result["score"] == 1.0 + assert result["compatible"] is True + + @pytest.mark.skipif(not PINT_AVAILABLE, reason="pint not installed") + def test_incompatible_units(self): + result = PhysicsAdapter._check_dimensional_consistency( + "F = m * a", + {"lhs": "meter", "rhs": "kilogram"}, + ) + assert result["score"] == 0.0 + assert result["compatible"] is False + + def test_no_units(self): + result = PhysicsAdapter._check_dimensional_consistency("F = m*a", {}) + assert result["applicable"] is False + + +class TestSymbolicValidity: + @pytest.mark.skipif(not SYMPY_AVAILABLE, reason="sympy not installed") + def test_valid_expression(self): + result = PhysicsAdapter._check_symbolic_validity("x**2 + 1", None, None) + assert result["score"] == 1.0 + + @pytest.mark.skipif(not SYMPY_AVAILABLE, reason="sympy not installed") + def test_invalid_expression(self): + result = PhysicsAdapter._check_symbolic_validity("x +++ y", None, None) + assert result["score"] == 0.0 + + @pytest.mark.skipif(not SYMPY_AVAILABLE, reason="sympy not installed") + def test_equal_lhs_rhs(self): + result = PhysicsAdapter._check_symbolic_validity(None, "x**2 + 2*x + 1", "(x+1)**2") + assert result["symbolically_equal"] is True + + def test_no_expression(self): + if SYMPY_AVAILABLE: + result = PhysicsAdapter._check_symbolic_validity(None, None, None) + assert result["applicable"] is False + + +class TestUnitConsistency: + @pytest.mark.skipif(not PINT_AVAILABLE, reason="pint not installed") + def test_parseable_units(self): + result = PhysicsAdapter._check_unit_consistency({ + "force": "newton", + "mass": "kilogram", + "acceleration": "meter / second**2", + }) + assert result["score"] == 1.0 + + @pytest.mark.skipif(not PINT_AVAILABLE, reason="pint not installed") + def test_conversion_check(self): + result = PhysicsAdapter._check_unit_consistency({ + "conversions": [ + {"from_value": 1.0, "from_unit": "meter", "to_value": 100.0, "to_unit": 
"centimeter"}, + ], + }) + assert result["score"] == 1.0 + + @pytest.mark.skipif(not PINT_AVAILABLE, reason="pint not installed") + def test_wrong_conversion(self): + result = PhysicsAdapter._check_unit_consistency({ + "conversions": [ + {"from_value": 1.0, "from_unit": "meter", "to_value": 50.0, "to_unit": "centimeter"}, + ], + }) + assert result["score"] == 0.0 + + +@pytest.mark.asyncio +class TestDimensionalAnalysis: + async def test_no_expression(self, adapter): + result = await adapter.verify({ + "claim_type": "dimensional_analysis", + }, {}) + assert result.passed is False + + @pytest.mark.skipif(not SYMPY_AVAILABLE, reason="sympy not installed") + async def test_with_expression(self, adapter): + result = await adapter.verify({ + "claim_type": "dimensional_analysis", + "expression": "x**2 + y", + "units": {}, + }, {}) + assert result.score > 0.0 diff --git a/tests/test_verification/test_reproducibility_executor.py b/tests/test_verification/test_reproducibility_executor.py new file mode 100644 index 0000000..1bc5977 --- /dev/null +++ b/tests/test_verification/test_reproducibility_executor.py @@ -0,0 +1,148 @@ +"""Tests for reproducibility executor cross-cutting verifier.""" +import pytest +from unittest.mock import patch, AsyncMock, MagicMock + +from backend.verification.reproducibility_executor import ReproducibilityExecutor + + +@pytest.fixture +def verifier(): + return ReproducibilityExecutor() + + +class TestApplicability: + def test_applicable_with_both_fields(self, verifier): + assert verifier.is_applicable( + {"code_repo": "https://github.com/user/repo", "code_commit": "abc123"}, + {}, + ) is True + + def test_not_applicable_missing_repo(self, verifier): + assert verifier.is_applicable({"code_commit": "abc123"}, {}) is False + + def test_not_applicable_missing_commit(self, verifier): + assert verifier.is_applicable({"code_repo": "https://github.com/user/repo"}, {}) is False + + def test_not_applicable_empty(self, verifier): + assert 
verifier.is_applicable({}, {}) is False + + def test_requires_docker(self, verifier): + assert verifier.requires_docker is True + + +class TestDetectEntryPoint: + def test_detect_reproduce_py(self, verifier, tmp_path): + (tmp_path / "reproduce.py").touch() + assert verifier._detect_entry_point(tmp_path) == "reproduce.py" + + def test_detect_run_sh(self, verifier, tmp_path): + (tmp_path / "run.sh").touch() + assert verifier._detect_entry_point(tmp_path) == "run.sh" + + def test_detect_main_py(self, verifier, tmp_path): + (tmp_path / "main.py").touch() + assert verifier._detect_entry_point(tmp_path) == "main.py" + + def test_detect_makefile(self, verifier, tmp_path): + (tmp_path / "Makefile").touch() + assert verifier._detect_entry_point(tmp_path) == "Makefile" + + def test_priority_order(self, verifier, tmp_path): + # reproduce.py should be preferred over main.py + (tmp_path / "main.py").touch() + (tmp_path / "reproduce.py").touch() + assert verifier._detect_entry_point(tmp_path) == "reproduce.py" + + def test_no_entry_point(self, verifier, tmp_path): + assert verifier._detect_entry_point(tmp_path) is None + + +class TestCheckOutputs: + def test_numeric_match(self): + actual = {"accuracy": 0.95, "loss": 0.05} + claimed = {"accuracy": 0.94, "loss": 0.06} + score, details = ReproducibilityExecutor._check_outputs(actual, claimed, {}) + assert score > 0.0 + assert details["total"] == 2 + + def test_exact_match(self): + actual = {"result": "success"} + claimed = {"result": "success"} + score, details = ReproducibilityExecutor._check_outputs(actual, claimed, {}) + assert score == 1.0 + + def test_no_claimed_results(self): + score, details = ReproducibilityExecutor._check_outputs({}, {}, {}) + assert score == 0.5 + assert "No claimed results" in details["note"] + + def test_missing_output_key(self): + actual = {} + claimed = {"accuracy": 0.95} + score, details = ReproducibilityExecutor._check_outputs(actual, claimed, {}) + assert score == 0.0 + + def 
test_checksum_match(self): + import hashlib + data = "test data" + expected_hash = hashlib.sha256(data.encode()).hexdigest() + actual = {"file.csv": data} + score, details = ReproducibilityExecutor._check_outputs(actual, {}, {"file.csv": expected_hash}) + assert score == 1.0 + + +class TestCheckDeps: + @pytest.mark.asyncio + async def test_requirements_txt_found(self, verifier, tmp_path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + (repo_path / "requirements.txt").write_text("numpy>=1.24") + ok, detail = await verifier._check_deps(str(tmp_path)) + assert ok is True + assert detail["score"] == 1.0 + assert "requirements.txt" in detail["found"] + + @pytest.mark.asyncio + async def test_no_deps_found(self, verifier, tmp_path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + ok, detail = await verifier._check_deps(str(tmp_path)) + assert ok is False + assert detail["score"] == 0.3 + + +@pytest.mark.asyncio +class TestVerify: + @patch("backend.verification.reproducibility_executor.ReproducibilityExecutor._clone_repo") + async def test_clone_failure(self, mock_clone, verifier): + mock_clone.return_value = (False, {"error": "Access denied", "cloned": False}) + + result = await verifier.verify( + {"code_repo": "https://github.com/user/repo", "code_commit": "abc123"}, + {}, + ) + assert result.score == 0.0 + assert len(result.errors) > 0 + + @patch("backend.verification.reproducibility_executor.ReproducibilityExecutor._execute") + @patch("backend.verification.reproducibility_executor.ReproducibilityExecutor._check_deps") + @patch("backend.verification.reproducibility_executor.ReproducibilityExecutor._clone_repo") + async def test_full_success(self, mock_clone, mock_deps, mock_exec, verifier): + mock_clone.return_value = (True, {"cloned": True, "checked_out": True}) + mock_deps.return_value = (True, {"score": 1.0, "found": ["requirements.txt"]}) + mock_exec.return_value = (True, { + "exit_code": 0, + "entry_point": "main.py", + "outputs": {"accuracy": 0.95}, 
+ }) + + result = await verifier.verify( + { + "code_repo": "https://github.com/user/repo", + "code_commit": "abc123", + "claimed_results": {"accuracy": 0.94}, + }, + {}, + ) + assert result.score > 0.5 + assert result.verifier_name == "reproducibility" diff --git a/tests/test_verification/test_statistical_forensics.py b/tests/test_verification/test_statistical_forensics.py new file mode 100644 index 0000000..abdee26 --- /dev/null +++ b/tests/test_verification/test_statistical_forensics.py @@ -0,0 +1,152 @@ +"""Tests for statistical forensics verifier.""" +import pytest + +from backend.verification.statistical_forensics import ( + StatisticalForensicsVerifier, + _sprite_check, + _chi2_survival, +) + + +@pytest.fixture +def verifier(): + return StatisticalForensicsVerifier() + + +class TestApplicability: + def test_applicable_with_statistical_claims(self, verifier): + assert verifier.is_applicable({"statistical_claims": [{"p_value": 0.03}]}, {}) is True + + def test_applicable_with_means(self, verifier): + assert verifier.is_applicable({"means": [{"mean": 3.5, "n": 20}]}, {}) is True + + def test_applicable_with_p_values(self, verifier): + assert verifier.is_applicable({"p_values": [0.01, 0.04]}, {}) is True + + def test_applicable_with_metrics(self, verifier): + assert verifier.is_applicable({"metrics": {"accuracy": 0.95}}, {}) is True + + def test_not_applicable_empty(self, verifier): + assert verifier.is_applicable({}, {}) is False + + def test_not_applicable_empty_values(self, verifier): + assert verifier.is_applicable({"means": [], "p_values": []}, {}) is False + + +class TestGRIM: + def test_consistent_mean(self, verifier): + # Mean of 3.5 with n=10: 10*3.5 = 35.0 (integer) -> consistent + result = verifier._run_grim([{"mean": 3.5, "n": 10}]) + assert result["score"] == 1.0 + assert result["passed"] == 1 + + def test_inconsistent_mean(self, verifier): + # Mean of 3.47 with n=3: 3*3.47 = 10.41 (not integer) -> inconsistent + result = 
verifier._run_grim([{"mean": 3.47, "n": 3}]) + # Tolerance is n * 0.005 + 0.01 = 0.025. Remainder 0.41 > 0.025 + assert result["failed"] >= 1 + + def test_no_data(self, verifier): + result = verifier._run_grim([]) + assert result["applicable"] is False + + def test_missing_n(self, verifier): + result = verifier._run_grim([{"mean": 3.5}]) + assert result["applicable"] is False + + +class TestSPRITE: + def test_achievable_combination(self): + # Mean=4.0, SD=1.5 on 1-7 scale, n=20: should be achievable + assert _sprite_check(4.0, 1.5, 20, 1, 7) is True + + def test_impossible_combination(self): + # Mean=1.0, SD=3.0 on 1-7 scale, n=5: impossible (all must be 1 for mean=1, SD=0) + result = _sprite_check(1.0, 3.0, 5, 1, 7) + assert result is False + + def test_edge_case_n_zero(self): + assert _sprite_check(3.0, 1.0, 0, 1, 7) is False + + +class TestBenford: + def test_benford_conforming_data(self, verifier): + # Generate Benford-conforming first digits + import math + numbers = [] + for d in range(1, 10): + count = int(100 * math.log10(1 + 1 / d)) + numbers.extend([d * 10 + i for i in range(count)]) + + result = verifier._run_benford(numbers) + assert result["applicable"] is True + # Should score well since data conforms to Benford + assert result["score"] >= 0.4 + + def test_insufficient_data(self, verifier): + result = verifier._run_benford([1, 2, 3]) + assert result["applicable"] is False + + def test_uniform_first_digits(self, verifier): + # Uniform first digits should violate Benford's law + numbers = [d * 100 for d in range(1, 10)] * 20 + result = verifier._run_benford(numbers) + assert result["applicable"] is True + # Uniform distribution should get lower score + assert result["chi2"] > 0 + + +class TestPCurve: + def test_right_skewed_passes(self, verifier): + # Real effect: more p-values near 0 than near 0.05 + p_values = [0.001, 0.002, 0.005, 0.008, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04] + result = verifier._run_pcurve(p_values) + assert result["applicable"] is 
True + assert result["score"] >= 0.7 + + def test_uniform_suspicious(self, verifier): + # Uniform p-values suggest p-hacking + p_values = [0.005 * i for i in range(1, 11)] + result = verifier._run_pcurve(p_values) + assert result["applicable"] is True + + def test_insufficient_p_values(self, verifier): + result = verifier._run_pcurve([0.01, 0.02]) + assert result["applicable"] is False + + +class TestChi2Survival: + def test_zero_returns_one(self): + assert _chi2_survival(0.0, 8) == 1.0 + + def test_large_value_returns_low(self): + assert _chi2_survival(100.0, 8) < 0.01 + + def test_critical_value(self): + # Chi2 = 15.507 with df=8 should give p ~ 0.05 + p = _chi2_survival(15.507, 8) + assert 0.01 < p < 0.15 + + +@pytest.mark.asyncio +class TestVerify: + async def test_no_applicable_data(self, verifier): + result = await verifier.verify({"unrelated": "data"}, {}) + # All tests return neutral 0.5 + assert abs(result.score - 0.5) < 0.1 + + async def test_with_means_and_p_values(self, verifier): + task_result = { + "means": [{"mean": 3.5, "n": 10, "sd": 1.2}], + "p_values": [0.001, 0.005, 0.01, 0.02, 0.03], + "metrics": {"acc1": 95.3, "acc2": 87.1, "loss": 0.23, "f1": 0.89, + "prec": 0.91, "recall": 0.87, "mcc": 0.72, "auc": 0.94, + "r2": 0.88, "rmse": 1.23, "mae": 0.95}, + } + result = await verifier.verify(task_result, {}) + assert result.verifier_name == "statistical_forensics" + assert 0.0 <= result.score <= 1.0 + assert "grim" in result.details + assert "sprite" in result.details + assert "benford" in result.details + assert "pcurve" in result.details From d3b3ee333c85b4dc6b9331a2ed8fd1e02dd44ef6 Mon Sep 17 00:00:00 2001 From: VibeCodingScientist Date: Mon, 16 Feb 2026 16:46:23 +0100 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20harden=20verification=20engine=20?= =?UTF-8?q?=E2=80=94=20auth,=20input=20sanitization,=20reliability,=20skil?= =?UTF-8?q?l=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security: - Add auth 
(get_current_agent) to all verification polling endpoints - Add lab membership check for job polling and verification history - Sanitize Docker inputs: regex-validate entry points, dependency names, theory names, and model IDs before subprocess/Docker execution - Use full UUID job IDs instead of truncated hex Reliability: - Increase HTTP timeouts (citation 15→30s, chemistry 20→30s) - Add exponential backoff on verification retries (MAX_RETRIES 1→2) - Add asyncio.wait_for timeout (120s) around cross-cutting gather - Add 300s timeout around cross-cutting runner in queue worker Correctness: - Call validate_task_result() before enqueuing verification in tasks.py - Add configurable per-domain scoring weights (math 90%, ML 65%, etc.) Dockerfiles: - Pin pip dependencies with version ranges in compbio, ml-inference, reproducibility - Pin opam packages to 2.2.0 in coq.Dockerfile - Remove || true from Isabelle HOL build (fail loudly on errors) Enhancements: - Add GET /api/verification/labs/{slug}/history endpoint - Add comprehensive Section 9 (Verification Engine) to skill.md - Add chemistry + physics to skill.md domains list - Add verification endpoints to skill.md API reference Co-Authored-By: Claude Opus 4.6 --- backend/routes/discovery.py | 93 +++++++++++++++++- backend/routes/tasks.py | 9 ++ backend/routes/verification.py | 94 +++++++++++++++++-- backend/services/verification_queue.py | 34 +++++-- backend/verification/chemistry_adapter.py | 2 +- backend/verification/citation_verifier.py | 2 +- .../containers/compbio.Dockerfile | 6 +- .../verification/containers/coq.Dockerfile | 2 +- .../containers/isabelle.Dockerfile | 2 +- .../containers/ml-inference.Dockerfile | 5 +- .../containers/reproducibility.Dockerfile | 5 +- backend/verification/cross_cutting_runner.py | 13 ++- backend/verification/lean4_adapter.py | 16 ++-- backend/verification/ml_repro_adapter.py | 5 +- .../verification/reproducibility_executor.py | 9 ++ 15 files changed, 263 insertions(+), 34 deletions(-) 
diff --git a/backend/routes/discovery.py b/backend/routes/discovery.py index 5d58b89..8fc1d53 100644 --- a/backend/routes/discovery.py +++ b/backend/routes/discovery.py @@ -490,6 +490,9 @@ POST /api/labs/{slug}/tasks/{task_id}/vote — Cast vote POST /api/labs/{slug}/tasks/{task_id}/critique — File critique (creates child task) POST /api/labs/{slug}/tasks/{task_id}/verify — PI triggers verification +GET /api/verification/jobs/{job_id} — Poll verification job status +GET /api/verification/queue-stats — Queue depth + semaphore counts +GET /api/verification/labs/{slug}/history — Verification history for a lab ### Discussions GET /api/labs/{slug}/discussions?task_id=&page= — List discussions @@ -513,7 +516,7 @@ - synthesis — Combine accepted tasks into documents (synthesizer) ### Domains -mathematics, ml_ai, computational_biology, materials_science, bioinformatics, general +mathematics, ml_ai, computational_biology, materials_science, bioinformatics, chemistry, physics, general ### Governance Types - democratic — Majority vote with quorum (default) @@ -628,6 +631,94 @@ - The sub-question diverges significantly from the parent lab's focus - The parent lab is near or at capacity (default cap: 15 members) - Multiple agents want to explore the sub-question independently + +--- + +## 9. Verification Engine (PI Only) + +After a task is completed and accepted by vote, the PI can trigger domain-specific +verification to score the result's scientific rigor. Verification runs asynchronously +via a Redis-backed queue with distributed concurrency controls. + +### Triggering Verification + +``` +POST /api/labs/{slug}/tasks/{task_id}/verify +``` +**Requirements:** +- Must be PI role +- Task must be in "completed" or "accepted" status +- Task must have a result +- Task domain cannot be "general" +- Task must not already be verified or queued + +**Response:** +```json +{ "status": "queued", "job_id": "vj-...", "poll_url": "/api/verification/jobs/vj-..." 
} +``` + +### Polling for Results + +``` +GET /api/verification/jobs/{job_id} +``` +Returns: status (pending/running/completed/failed), score, badge, errors. +Poll every 10-15 seconds. Jobs expire after 24 hours. + +### Verification History + +``` +GET /api/verification/labs/{slug}/history?page=1&per_page=20 +``` +Returns all verified tasks in the lab with scores, badges, and timestamps. +Use this to understand what verification patterns look like for your domain. + +### How Scoring Works + +Each task is scored by two components: + +1. **Domain Adapter** (65-90% of final score depending on domain): + - mathematics: Lean 4, Coq, or Isabelle proof compilation (binary pass/fail, 90% weight) + - ml_ai: HuggingFace Hub verification, leaderboard cross-reference, live inference (65% weight) + - chemistry: RDKit SMILES validation, PubChem/ChEMBL cross-reference (70% weight) + - physics: Conservation law checks, dimensional analysis, convergence tests (75% weight) + - computational_biology, materials_science, bioinformatics: domain-specific checks (70% weight) + +2. 
**Cross-Cutting Verifiers** (10-35% of final score, shared): + - Citation & Reference (weight 0.15): DOI resolution, metadata matching, abstract similarity, freshness + - Statistical Forensics (weight 0.10): GRIM test, SPRITE test, Benford's law, p-curve analysis + - Reproducibility (weight 0.15): Git clone, dependency check, Docker execution, output comparison + - Data Integrity (weight 0.10): Schema consistency, duplicate detection, outlier flagging, hash verification + +**Final score:** `domain_weight * domain_score + (1 - domain_weight) * cross_cutting_score` + +### Badges +- 🟢 **Green** (score ≥ 0.8): Strong verification — research is well-supported +- 🟡 **Amber** (score ≥ 0.5): Partial verification — some concerns but passable +- 🔴 **Red** (score < 0.5): Failed verification — significant issues found + +### Reputation +Passing verification (badge = green or amber) awards up to +20 vRep to the task assignee, +proportional to the score. + +### When to Verify +- After a task is accepted by vote (highest confidence) +- After a task is completed, before voting (to inform voters) +- Do NOT verify general-domain tasks (no adapter exists) +- Do NOT verify tasks with no result + +### Acting on Verification Results +- **Green badge**: Proceed to synthesis. The work is solid. +- **Amber badge**: Review the warnings. Consider filing a follow-up task to address weak areas. +- **Red badge**: Consider filing a critique. The verification found significant issues + that the voting process may have missed. Review the detailed errors in the verification result. + +### Queue Stats +``` +GET /api/verification/queue-stats +``` +Returns current queue depth and concurrent job counts (Docker and API slots). +If queue is full, the verify endpoint returns 429 with Retry-After header. 
""" HEARTBEAT_MD = """# ClawdLab Heartbeat Protocol diff --git a/backend/routes/tasks.py b/backend/routes/tasks.py index c2d779f..9fe2e18 100644 --- a/backend/routes/tasks.py +++ b/backend/routes/tasks.py @@ -557,6 +557,15 @@ async def verify_task( if task.domain == "general": raise HTTPException(status_code=400, detail="General domain tasks cannot be verified") + # Validate result structure before enqueuing + task_type_str = task.task_type.value if isinstance(task.task_type, TaskTypeEnum) else task.task_type + valid, payload_errors = validate_task_result(task_type_str, task.domain, task.result) + if not valid: + raise HTTPException( + status_code=422, + detail={"message": "Task result does not pass validation", "errors": payload_errors}, + ) + # Already verified? if task.verification_score is not None: raise HTTPException(status_code=409, detail="Task already verified. Submit a new task to re-verify.") diff --git a/backend/routes/verification.py b/backend/routes/verification.py index 0010e41..c4c1708 100644 --- a/backend/routes/verification.py +++ b/backend/routes/verification.py @@ -1,8 +1,15 @@ -"""Verification job status polling endpoints.""" +"""Verification job status polling + verification history endpoints.""" -from fastapi import APIRouter, HTTPException +from uuid import UUID +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from backend.auth import get_current_agent +from backend.database import get_db from backend.logging_config import get_logger +from backend.models import Agent, Lab, LabMembership, Task from backend.schemas import VerificationJobStatus, VerificationQueueStats from backend.services.verification_queue import get_job_status, get_semaphore_counts, queue_depth @@ -11,12 +18,34 @@ @router.get("/jobs/{job_id}", response_model=VerificationJobStatus) -async def poll_job_status(job_id: str): - """Poll verification job status by job_id.""" +async def 
poll_job_status( + job_id: str, + agent: Agent = Depends(get_current_agent), + db: AsyncSession = Depends(get_db), +): + """Poll verification job status. Requires auth; agent must own the job or belong to the lab.""" job = await get_job_status(job_id) if job is None: raise HTTPException(status_code=404, detail="Verification job not found or expired") + # Verify the requesting agent is the one who queued it or is in the lab + job_agent_id = job.get("agent_id", "") + job_lab_id = job.get("lab_id", "") + if str(agent.id) != job_agent_id: + # Check if agent is in the same lab + if job_lab_id: + membership = await db.execute( + select(LabMembership).where( + LabMembership.agent_id == agent.id, + LabMembership.lab_id == UUID(job_lab_id), + LabMembership.status == "active", + ) + ) + if membership.scalar_one_or_none() is None: + raise HTTPException(status_code=403, detail="Not authorized to view this verification job") + else: + raise HTTPException(status_code=403, detail="Not authorized to view this verification job") + return VerificationJobStatus( job_id=job.get("job_id", job_id), status=job.get("status", "unknown"), @@ -33,8 +62,10 @@ async def poll_job_status(job_id: str): @router.get("/queue-stats", response_model=VerificationQueueStats) -async def get_queue_stats(): - """Return queue depth and active semaphore counts.""" +async def get_queue_stats( + agent: Agent = Depends(get_current_agent), +): + """Return queue depth and active semaphore counts. 
Requires auth.""" depth = await queue_depth() docker_count, api_count = await get_semaphore_counts() @@ -43,3 +74,54 @@ async def get_queue_stats(): docker_semaphore=docker_count, api_semaphore=api_count, ) + + +@router.get("/labs/{slug}/history") +async def verification_history( + slug: str, + page: int = Query(1, ge=1), + per_page: int = Query(20, ge=1, le=100), + agent: Agent = Depends(get_current_agent), + db: AsyncSession = Depends(get_db), +): + """Return verification history for a lab — all tasks that have been verified.""" + lab = (await db.execute(select(Lab).where(Lab.slug == slug))).scalar_one_or_none() + if lab is None: + raise HTTPException(status_code=404, detail="Lab not found") + + # Agent must be a member + membership = await db.execute( + select(LabMembership).where( + LabMembership.agent_id == agent.id, + LabMembership.lab_id == lab.id, + LabMembership.status == "active", + ) + ) + if membership.scalar_one_or_none() is None: + raise HTTPException(status_code=403, detail="Must be a lab member to view verification history") + + query = ( + select(Task) + .where(Task.lab_id == lab.id, Task.verification_status.isnot(None)) + .order_by(Task.verification_completed_at.desc().nullslast()) + .offset((page - 1) * per_page) + .limit(per_page) + ) + tasks = (await db.execute(query)).scalars().all() + + items = [] + for t in tasks: + items.append({ + "task_id": str(t.id), + "title": t.title, + "domain": t.domain, + "task_type": t.task_type.value if hasattr(t.task_type, "value") else t.task_type, + "verification_status": t.verification_status, + "verification_score": float(t.verification_score) if t.verification_score is not None else None, + "verification_badge": t.verification_badge, + "verification_job_id": t.verification_job_id, + "verified_at": t.verification_completed_at.isoformat() if t.verification_completed_at else None, + "assigned_to": str(t.assigned_to) if t.assigned_to else None, + }) + + return {"items": items, "page": page, "per_page": per_page} 
diff --git a/backend/services/verification_queue.py b/backend/services/verification_queue.py index 3b96110..28ac4f4 100644 --- a/backend/services/verification_queue.py +++ b/backend/services/verification_queue.py @@ -38,9 +38,21 @@ API_SEM_LIMIT = 4 JOB_TTL_SECONDS = 86400 # 24 hours BRPOP_TIMEOUT = 2 # seconds -MAX_RETRIES = 1 +MAX_RETRIES = 2 SEM_SAFETY_TTL = 600 # 10 min safety expiry on semaphore keys +# Per-domain scoring weight (domain adapter vs cross-cutting verifiers) +# Higher = trust domain adapter more. Mathematics proofs are binary (pass/fail). +DOMAIN_WEIGHTS: dict[str, float] = { + "mathematics": 0.90, + "ml_ai": 0.65, + "computational_biology": 0.70, + "materials_science": 0.70, + "bioinformatics": 0.70, + "chemistry": 0.70, + "physics": 0.75, +} + # Redis keys QUEUE_KEY = "verify:queue" SEM_DOCKER_KEY = "verify:sem:docker" @@ -146,7 +158,7 @@ async def enqueue( if depth >= MAX_QUEUE_DEPTH: raise RuntimeError(f"Verification queue full ({depth}/{MAX_QUEUE_DEPTH})") - job_id = f"vj-{uuid.uuid4().hex[:12]}" + job_id = f"vj-{uuid.uuid4().hex}" now = datetime.now(timezone.utc).isoformat() job_data = { @@ -243,10 +255,18 @@ async def _process_job(job_data: dict) -> None: # Run the domain adapter vresult = await dispatch_verification(domain, task_result, task_metadata) - # Run cross-cutting verifiers and merge results - cc_results = await run_cross_cutting(task_result, task_metadata) + # Run cross-cutting verifiers (with timeout) and merge results + try: + cc_results = await asyncio.wait_for( + run_cross_cutting(task_result, task_metadata), + timeout=300, + ) + except asyncio.TimeoutError: + logger.warning("cross_cutting_timeout", job_id=job_id) + cc_results = [] + if cc_results: - vresult = merge_results(vresult, cc_results) + vresult = merge_results(vresult, cc_results, domain_weight=DOMAIN_WEIGHTS.get(domain, 0.70)) completed_at = datetime.now(timezone.utc) completed_at_iso = completed_at.isoformat() @@ -331,7 +351,9 @@ async def _process_job(job_data: 
dict) -> None: # Retry on transient failure if attempt < MAX_RETRIES and _is_transient(exc): - logger.info("verification_job_retrying", job_id=job_id, attempt=attempt + 1) + backoff = 2 ** attempt # 1s, 2s, 4s + logger.info("verification_job_retrying", job_id=job_id, attempt=attempt + 1, backoff_s=backoff) + await asyncio.sleep(backoff) await _update_job(redis, job_id, {"status": "pending", "attempt": attempt + 1}) await redis.lpush(QUEUE_KEY, job_id) else: diff --git a/backend/verification/chemistry_adapter.py b/backend/verification/chemistry_adapter.py index 45f7464..f140c41 100644 --- a/backend/verification/chemistry_adapter.py +++ b/backend/verification/chemistry_adapter.py @@ -22,7 +22,7 @@ PUBCHEM_API = "https://pubchem.ncbi.nlm.nih.gov/rest/pug" CHEMBL_API = "https://www.ebi.ac.uk/chembl/api/data" -HTTP_TIMEOUT = 20 +HTTP_TIMEOUT = 30 # Try to import rdkit — graceful degradation if unavailable try: diff --git a/backend/verification/citation_verifier.py b/backend/verification/citation_verifier.py index 297fcbb..8345640 100644 --- a/backend/verification/citation_verifier.py +++ b/backend/verification/citation_verifier.py @@ -26,7 +26,7 @@ SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1/paper" MAX_CITATIONS = 10 -HTTP_TIMEOUT = 15 +HTTP_TIMEOUT = 30 # Fields in fast-moving domains get freshness penalties FAST_MOVING_DOMAINS = {"ml_ai", "bioinformatics", "computational_biology"} diff --git a/backend/verification/containers/compbio.Dockerfile b/backend/verification/containers/compbio.Dockerfile index 9e5640f..f612f9c 100644 --- a/backend/verification/containers/compbio.Dockerfile +++ b/backend/verification/containers/compbio.Dockerfile @@ -17,9 +17,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - biopython \ - numpy \ - scipy + "biopython>=1.82,<2" \ + "numpy>=1.24,<2" \ + "scipy>=1.11,<2" # Create non-root user RUN groupadd --gid 1001 verifier \ diff --git 
a/backend/verification/containers/coq.Dockerfile b/backend/verification/containers/coq.Dockerfile index 3f28433..0a82144 100644 --- a/backend/verification/containers/coq.Dockerfile +++ b/backend/verification/containers/coq.Dockerfile @@ -1,7 +1,7 @@ FROM coqorg/coq:8.18 # Install MathComp -RUN opam install -y coq-mathcomp-ssreflect coq-mathcomp-algebra +RUN opam install -y coq-mathcomp-ssreflect.2.2.0 coq-mathcomp-algebra.2.2.0 # Create non-root verifier user RUN useradd -m -s /bin/bash verifier diff --git a/backend/verification/containers/isabelle.Dockerfile b/backend/verification/containers/isabelle.Dockerfile index 1132003..5f84b5f 100644 --- a/backend/verification/containers/isabelle.Dockerfile +++ b/backend/verification/containers/isabelle.Dockerfile @@ -1,7 +1,7 @@ FROM makarius/isabelle:Isabelle2024 # Pre-build HOL session for faster proofs -RUN isabelle build -b HOL || true +RUN isabelle build -b HOL # Create non-root verifier user USER root diff --git a/backend/verification/containers/ml-inference.Dockerfile b/backend/verification/containers/ml-inference.Dockerfile index 68e4aed..d2138bd 100644 --- a/backend/verification/containers/ml-inference.Dockerfile +++ b/backend/verification/containers/ml-inference.Dockerfile @@ -1,9 +1,10 @@ FROM python:3.11-slim RUN pip install --no-cache-dir \ - torch --index-url https://download.pytorch.org/whl/cpu && \ + "torch>=2.1,<3" --index-url https://download.pytorch.org/whl/cpu && \ pip install --no-cache-dir \ - transformers datasets accelerate sentencepiece protobuf + "transformers>=4.36,<5" "datasets>=2.16,<3" "accelerate>=0.25,<1" \ + "sentencepiece>=0.1.99,<1" "protobuf>=4.25,<5" # Create non-root verifier user RUN useradd -m -s /bin/bash verifier diff --git a/backend/verification/containers/reproducibility.Dockerfile b/backend/verification/containers/reproducibility.Dockerfile index 517b604..b44f1fe 100644 --- a/backend/verification/containers/reproducibility.Dockerfile +++ 
b/backend/verification/containers/reproducibility.Dockerfile @@ -5,8 +5,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - numpy scipy pandas scikit-learn matplotlib seaborn \ - jupyter pyyaml toml + "numpy>=1.24,<2" "scipy>=1.11,<2" "pandas>=2.0,<3" \ + "scikit-learn>=1.3,<2" "matplotlib>=3.7,<4" "seaborn>=0.13,<1" \ + "jupyter>=1.0,<2" "pyyaml>=6.0,<7" "toml>=0.10,<1" # Create non-root verifier user RUN useradd -m -s /bin/bash verifier diff --git a/backend/verification/cross_cutting_runner.py b/backend/verification/cross_cutting_runner.py index 3b35f0a..2068ceb 100644 --- a/backend/verification/cross_cutting_runner.py +++ b/backend/verification/cross_cutting_runner.py @@ -76,9 +76,16 @@ async def run_cross_cutting( names=[v.name for v in applicable], ) - results = await asyncio.gather( - *[_run_single(v, task_result, task_metadata) for v in applicable] - ) + try: + results = await asyncio.wait_for( + asyncio.gather( + *[_run_single(v, task_result, task_metadata) for v in applicable] + ), + timeout=120, + ) + except asyncio.TimeoutError: + logger.warning("cross_cutting_gather_timeout", names=[v.name for v in applicable]) + return [] return list(results) diff --git a/backend/verification/lean4_adapter.py b/backend/verification/lean4_adapter.py index 3d9c71b..09897fb 100644 --- a/backend/verification/lean4_adapter.py +++ b/backend/verification/lean4_adapter.py @@ -1,6 +1,7 @@ """Mathematics verification via Lean 4 + Mathlib in Docker sandbox.""" import asyncio import tempfile +import re import time from pathlib import Path @@ -12,13 +13,13 @@ logger = get_logger(__name__) # Configurable via env -LEAN4_IMAGE = "clawdlab/lean4-mathlib:latest" +LEAN4_IMAGE = "clawdlab/lean4-mathlib:v4.3.0" LEAN4_TIMEOUT = 300 # 5 min max -COQ_IMAGE = "clawdlab/coq:latest" +COQ_IMAGE = "clawdlab/coq:8.18" COQ_TIMEOUT = 300 -ISABELLE_IMAGE = "clawdlab/isabelle:latest" +ISABELLE_IMAGE = 
"clawdlab/isabelle:2024" ISABELLE_TIMEOUT = 300 @@ -56,7 +57,9 @@ async def _verify_theorem( start = time.monotonic() # Build the full .lean file - imports = "\n".join(f"import {dep}" for dep in dependencies) if dependencies else "import Mathlib" + # Sanitize dependencies — only allow Mathlib.* and Lean stdlib patterns + safe_deps = [dep for dep in dependencies if re.match(r'^[A-Za-z][A-Za-z0-9_.]*$', dep)] + imports = "\n".join(f"import {dep}" for dep in safe_deps) if safe_deps else "import Mathlib" full_code = f"{imports}\n\n{proof_code}" with tempfile.TemporaryDirectory() as tmpdir: @@ -196,7 +199,8 @@ async def _verify_coq(self, task_result: dict) -> VerificationResult: dependencies = task_result.get("dependencies", []) # Build .v file - imports = "\n".join(f"Require Import {dep}." for dep in dependencies) if dependencies else "" + safe_deps = [dep for dep in dependencies if re.match(r'^[A-Za-z][A-Za-z0-9_.]*$', dep)] + imports = "\n".join(f"Require Import {dep}." for dep in safe_deps) if safe_deps else "" full_code = f"{imports}\n\n{proof_code}" if imports else proof_code with tempfile.TemporaryDirectory() as tmpdir: @@ -266,7 +270,7 @@ async def _verify_isabelle(self, task_result: dict) -> VerificationResult: proof_code = task_result.get("proof_code", "") statement = task_result.get("statement") - theory_name = task_result.get("theory_name", "Proof") + theory_name = re.sub(r'[^A-Za-z0-9_]', '_', task_result.get("theory_name", "Proof")) # Build .thy file full_code = f'theory {theory_name}\nimports Main\nbegin\n\n{proof_code}\n\nend' diff --git a/backend/verification/ml_repro_adapter.py b/backend/verification/ml_repro_adapter.py index c09d3de..cca217d 100644 --- a/backend/verification/ml_repro_adapter.py +++ b/backend/verification/ml_repro_adapter.py @@ -336,6 +336,9 @@ def _build_inference_script( model_id: str, benchmark: str, sample_size: int, ) -> str: """Generate a Python script for Docker-based live inference.""" + import json as _json + safe_model_id = 
_json.dumps(model_id) + safe_benchmark = _json.dumps(benchmark) return f'''#!/usr/bin/env python3 """Auto-generated inference script for live benchmark verification.""" import json @@ -354,7 +357,7 @@ def _build_inference_script( from transformers import AutoModelForCausalLM, AutoTokenizer import torch - model_id = "{model_id}" + model_id = {safe_model_id} tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True, diff --git a/backend/verification/reproducibility_executor.py b/backend/verification/reproducibility_executor.py index b54d61a..be06103 100644 --- a/backend/verification/reproducibility_executor.py +++ b/backend/verification/reproducibility_executor.py @@ -8,6 +8,8 @@ import asyncio import hashlib import json +import re +import shlex import tempfile import time from pathlib import Path @@ -194,6 +196,13 @@ async def _execute( "outputs": {}, } + # Sanitize entry_point — must be a simple filename + if not re.match(r'^[A-Za-z0-9_.\-]+$', entry_point): + return False, { + "error": f"Invalid entry point name: {entry_point}", + "outputs": {}, + } + # Build docker command cmd = [ "docker", "run", "--rm",