Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ ENV PATH="/opt/venv/bin:$PATH"
# Copy application code
COPY api/ /app/api/
COPY schema/ /app/schema/
# Offline backup-object oracle: the single source of truth for kg-backup/2 validation.
# Shipped so the API can load it by path for POST /admin/backup/verify. Stdlib-only /
# standalone — no api-package coupling. Copy just the one file (not the dir) to avoid
# pulling in __pycache__ and unrelated lint scripts.
COPY scripts/development/lint/lint_backup.py /app/scripts/development/lint/lint_backup.py

# Set Python path for imports (api.app.lib.*)
ENV PYTHONPATH=/app
Expand Down
5 changes: 5 additions & 0 deletions api/Dockerfile.rocm-host
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ RUN grep -v "^torch" requirements.txt | grep -v "^torchvision" > requirements-no
# Copy application code
COPY api/ /app/api/
COPY schema/ /app/schema/
# Offline backup-object oracle: the single source of truth for kg-backup/2 validation.
# Shipped so the API can load it by path for POST /admin/backup/verify. Stdlib-only /
# standalone — no api-package coupling. Copy just the one file (not the dir) to avoid
# pulling in __pycache__ and unrelated lint scripts.
COPY scripts/development/lint/lint_backup.py /app/scripts/development/lint/lint_backup.py

# Set Python path for imports
ENV PYTHONPATH=/app
Expand Down
75 changes: 75 additions & 0 deletions api/app/lib/backup_oracle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
Adapter to the offline backup-object oracle (ADR-102).

The single source of truth for kg-backup/2 *spec* validation is the standalone
``scripts/development/lint/lint_backup.py`` — a stdlib-only oracle with no
api-package dependency, so it doubles as a CI/test gate and loads standalone by
path (ADR-102 Track D / P6c). This module loads that oracle by path and exposes
a thin :func:`validate_backup_object`, so the API (``POST /admin/backup/verify``)
runs the *same* checks server-side — no reimplementation, no cross-language drift.

The oracle file is shipped into the API image (see ``api/Dockerfile``:
``COPY scripts/development/lint/lint_backup.py``) and is present at the repo root in dev.
"""

from __future__ import annotations

import importlib.util
from pathlib import Path
from typing import Any, Dict

# Location of the standalone oracle relative to a checkout root.
_ORACLE_RELATIVE_PATH = Path("scripts") / "development" / "lint" / "lint_backup.py"

# Ancestor directories of this file. With the expected api/app/lib/ layout,
# index 3 is the checkout root. A shallower install has fewer ancestors, so the
# lookup below is guarded: an unguarded ``parents[3]`` would raise IndexError at
# import time and take down every module that imports this one, even though the
# oracle itself is only loaded lazily.
_MODULE_ANCESTORS = Path(__file__).resolve().parents

# Candidate locations for the standalone oracle, in priority order: resolved
# relative to this file (api/app/lib -> repo root), the container path, then cwd.
_ORACLE_PATH_CANDIDATES = [
    *(
        [_MODULE_ANCESTORS[3] / _ORACLE_RELATIVE_PATH]
        if len(_MODULE_ANCESTORS) > 3
        else []
    ),
    Path("/app/scripts/development/lint/lint_backup.py"),
    Path.cwd() / _ORACLE_RELATIVE_PATH,
]

_oracle = None  # lazily-loaded module, cached after first load


def _load_oracle():
    """Load (once) the standalone ``lint_backup`` module by file path.

    Walks ``_ORACLE_PATH_CANDIDATES`` in order, executes the first existing file
    as the module ``kg_backup_oracle`` and caches it in the module-level
    ``_oracle``, so later calls return the same object without touching the
    filesystem again.

    The module is registered in ``sys.modules`` *before* it is executed, as the
    importlib "importing a source file directly" recipe prescribes. Code that
    looks up its own module by name while it is being defined (for example
    ``dataclasses`` resolving postponed annotations) needs that entry. The
    registration is rolled back if execution fails, so a half-initialised
    module never stays importable.

    Returns:
        The loaded oracle module.

    Raises:
        FileNotFoundError: if none of the candidate paths is a file (e.g. an API
            image built without the
            ``COPY scripts/development/lint/lint_backup.py`` line).
        ImportError: if a candidate file exists but no import spec/loader can be
            built for it.
    """
    import sys  # function-scope import keeps this block self-contained

    global _oracle
    if _oracle is not None:
        return _oracle

    module_name = "kg_backup_oracle"
    for path in _ORACLE_PATH_CANDIDATES:
        if not path.is_file():
            continue

        spec = importlib.util.spec_from_file_location(module_name, str(path))
        # spec_from_file_location may return None, and a spec may lack a loader;
        # fail with a clear message instead of an AttributeError further down.
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot build an import spec for backup oracle at {path}")

        module = importlib.util.module_from_spec(spec)
        previous = sys.modules.get(module_name)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            # Undo the registration so a broken oracle is not left importable.
            if previous is None:
                sys.modules.pop(module_name, None)
            else:
                sys.modules[module_name] = previous
            raise

        _oracle = module
        return _oracle

    raise FileNotFoundError(
        "offline backup oracle (lint_backup.py) not found; looked in: "
        + ", ".join(str(p) for p in _ORACLE_PATH_CANDIDATES)
    )


def validate_backup_object(obj: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a parsed kg-backup/2 object with the offline oracle.

    Passes ``obj`` to the oracle's ``validate_backup`` and flattens the result
    into a plain, JSON-serialisable dict.

    Returns:
        A dict with keys ``ok``, ``format_version``, ``errors``, ``warnings``,
        ``notices`` and ``issues``. ``issues`` holds every reported issue, in
        the oracle's order, as ``{severity, code, message, location}``; the three
        severity keys are filtered views over those same entries.
    """
    outcome = _load_oracle().validate_backup(obj)

    # Flatten the oracle's issue objects into plain dicts, preserving order.
    flattened = []
    for issue in outcome.issues:
        flattened.append(
            {
                "severity": issue.severity,
                "code": issue.code,
                "message": issue.message,
                "location": issue.location,
            }
        )

    def _with_severity(level):
        # Same dict objects as in ``flattened``, narrowed to one severity.
        return [entry for entry in flattened if entry["severity"] == level]

    report = {"ok": outcome.ok, "format_version": outcome.format_version}
    report["errors"] = _with_severity("ERROR")
    report["warnings"] = _with_severity("WARNING")
    report["notices"] = _with_severity("NOTICE")
    report["issues"] = flattened
    return report
101 changes: 100 additions & 1 deletion api/app/routes/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"""

import re
import json
import uuid
import shutil
import tempfile
Expand All @@ -39,7 +40,8 @@
from ..services.job_queue import get_job_queue
from ..lib.backup_streaming import create_backup_stream
from ..lib.backup_archive import stream_backup_archive, extract_backup_archive, cleanup_extracted_archive
from ..lib.backup_integrity import check_backup_integrity
from ..lib.backup_integrity import check_backup_integrity, check_backup_data
from ..lib.backup_oracle import validate_backup_object
from ..lib.age_client import AGEClient
from ..lib.encrypted_keys import EncryptedKeyStore
from pydantic import BaseModel
Expand Down Expand Up @@ -423,6 +425,103 @@ async def restore_backup(
)


@router.post("/backup/verify")
async def verify_backup(
    current_user: CurrentUser,
    _: None = Depends(require_permission("backups", "read")),
    file: UploadFile = File(..., description="Backup file (.tar.gz archive or .json)")
):
    """
    Validate a kg-backup/2 backup object WITHOUT restoring it (ADR-102).

    Runs the **offline oracle** — the single source of truth for kg-backup/2 spec
    validation (``scripts/development/lint/lint_backup.py``) — server-side against
    the uploaded backup, so the CLI/web do not reimplement validation (no
    cross-language drift). Returns its structured report: ``errors`` / ``warnings``
    / ``notices``, each with a stable ``code`` and JSON-path ``location``, plus
    best-effort record-count ``statistics`` (de-interned view).

    Read-only: no graph access, nothing queued, no mutation. Accepts the same two
    containers as ``/restore`` — ``.tar.gz`` (manifest.json extracted) or ``.json``.

    **Errors:** ``400`` when the filename has an unsupported extension or the
    payload is not valid UTF-8 JSON; ``500`` for any other failure.

    **Authorization:** Requires ``backups:read`` (admin-gated by default; grant
    ``backups:read`` to another role to delegate verification).
    """
    filename = file.filename or ""
    is_archive = filename.endswith('.tar.gz')
    is_json = filename.endswith('.json')
    if not is_archive and not is_json:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Backup file must be .tar.gz archive or .json format"
        )

    # Unique temp names so concurrent verifications never collide.
    temp_file_id = uuid.uuid4()
    archive_temp_dir = None
    archive_path = (
        Path(tempfile.gettempdir()) / f"verify_{temp_file_id}.tar.gz" if is_archive else None
    )
    temp_path = Path(tempfile.gettempdir()) / f"verify_{temp_file_id}.json"

    try:
        if is_archive:
            with open(archive_path, "wb") as temp_file:
                shutil.copyfileobj(file.file, temp_file)
            archive_temp_dir, manifest_path = extract_backup_archive(str(archive_path))
            # From here on the JSON to validate is the extracted manifest.
            temp_path = Path(manifest_path)
            archive_path.unlink()
        else:
            with open(temp_path, "wb") as temp_file:
                shutil.copyfileobj(file.file, temp_file)

        with open(temp_path, "r", encoding="utf-8") as f:
            obj = json.load(f)

        # Single source of truth: run the offline oracle server-side.
        report = validate_backup_object(obj)

        # Best-effort record-count statistics (de-interned view). Skipped if the
        # object is too malformed for the reader — the oracle already reported why.
        try:
            integrity = check_backup_data(obj)
            report["statistics"] = integrity.statistics or {}
            report["external_deps"] = getattr(integrity, "external_deps", 0)
        except Exception:
            # Deliberately non-fatal, but leave a trace for debugging.
            logger.debug("Backup statistics unavailable for %r", filename, exc_info=True)
            report["statistics"] = {}
            report["external_deps"] = 0

        report["filename"] = filename
        logger.info(
            f"Verified backup {filename!r}: ok={report['ok']} "
            f"errors={len(report['errors'])} warnings={len(report['warnings'])}"
        )
        return report

    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Both mean "the upload is not a UTF-8 JSON document": a client error,
        # not a server fault. UnicodeDecodeError used to fall through to the 500.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Backup file is not valid JSON: {e}"
        ) from e
    except HTTPException:
        raise
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Backup verify failed: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Backup verify failed: {str(e)}"
        ) from e
    finally:
        # Clean up everything we may have created — including the saved .tar.gz when
        # extraction throws before its in-try unlink (review: avoid a temp leak on
        # exactly the malformed-archive case verify exists to catch).
        if archive_path is not None and archive_path.exists():
            archive_path.unlink()
        if temp_path.exists():
            temp_path.unlink()
        if archive_temp_dir:
            cleanup_extracted_archive(archive_temp_dir)


# ========== Database Reset REMOVED - Too Dangerous for API ==========
#
# Database reset has been moved to initialize-platform.sh option 0 for security:
Expand Down
39 changes: 39 additions & 0 deletions cli/src/api/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1334,6 +1334,45 @@ export class KnowledgeGraphClient {
return response.data;
}

/**
 * Verify a kg-backup/2 backup file WITHOUT restoring it (ADR-102).
 *
 * Streams the file as multipart form data to POST /admin/backup/verify and
 * resolves with the report the server returns. The server runs the offline
 * oracle (scripts/development/lint/lint_backup.py); the CLI holds no
 * validation logic of its own.
 *
 * @param backupFilePath   Path of the backup file to upload.
 * @param onUploadProgress Optional callback receiving (uploaded bytes, total
 *                         bytes, rounded percent); only invoked when the
 *                         progress event carries a total.
 */
async verifyBackup(
  backupFilePath: string,
  onUploadProgress?: (uploaded: number, total: number, percent: number) => void
): Promise<{
  ok: boolean;
  format_version: string | null;
  errors: Array<{ severity: string; code: string; message: string; location: string }>;
  warnings: Array<{ severity: string; code: string; message: string; location: string }>;
  notices: Array<{ severity: string; code: string; message: string; location: string }>;
  issues: Array<{ severity: string; code: string; message: string; location: string }>;
  statistics?: Record<string, number>;
  external_deps?: number;
  filename?: string;
}> {
  const payload = new FormData();
  payload.append('file', fs.createReadStream(backupFilePath));

  const { data } = await this.client.post('/admin/backup/verify', payload, {
    headers: payload.getHeaders(),
    onUploadProgress: (progressEvent) => {
      const { loaded, total } = progressEvent;
      // Nothing to report without a listener or a known total size.
      if (!onUploadProgress || !total) {
        return;
      }
      onUploadProgress(loaded, total, Math.round((loaded / total) * 100));
    }
  });

  return data;
}

/**
* Get job scheduler status and statistics (ADR-014)
*/
Expand Down
122 changes: 122 additions & 0 deletions cli/src/cli/admin/backup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,125 @@ export function createRestoreCommand(): Command {
}
});
}

/**
 * Build the `verify-backup` CLI command.
 *
 * Resolves a backup file (positional path, `--file <name>` inside the
 * configured backup directory, or an interactive pick), uploads it through
 * `client.verifyBackup`, and prints the returned report. The process exits
 * with status 1 when the file cannot be resolved, the request fails, or the
 * report says the backup is invalid.
 */
export function createVerifyBackupCommand(): Command {
  return new Command('verify-backup')
    .description('Validate a backup file without restoring it (runs the server-side oracle)')
    .argument('[file]', 'Path to a backup .tar.gz or .json (omit to pick from the backup directory)')
    .option('--file <name>', 'Backup filename from the configured backup directory')
    .action(async (fileArg: string | undefined, options: any) => {
      try {
        const client = createClientFromEnv();
        const config = getConfig();
        const backupDir = config.getBackupDir();

        console.log('\n' + separator());
        console.log(colors.ui.title('🔎 Verify Backup'));
        console.log(separator());

        // Resolve the backup file: positional arg → --file (from dir) → interactive.
        let backupFilePath: string;
        if (fileArg) {
          backupFilePath = fileArg;
        } else if (options.file) {
          backupFilePath = path.join(backupDir, options.file);
        } else {
          if (!fs.existsSync(backupDir)) {
            console.error(colors.status.error('\n✗ No backups available - directory does not exist'));
            console.log(colors.status.dim(`Directory: ${backupDir}\n`));
            process.exit(1);
          }
          // Candidate files in the backup directory, largest first.
          const backups = fs.readdirSync(backupDir)
            .filter(f => f.endsWith('.tar.gz') || f.endsWith('.json'))
            .map(filename => {
              const filepath = path.join(backupDir, filename);
              return { filename, path: filepath, size_mb: fs.statSync(filepath).size / (1024 * 1024) };
            })
            .sort((a, b) => b.size_mb - a.size_mb);
          if (backups.length === 0) {
            console.error(colors.status.error('\n✗ No backups available'));
            console.log(colors.status.dim(`Directory: ${backupDir}\n`));
            process.exit(1);
          }

          // Only the entries actually listed can be chosen by number, so the
          // prompt range and the bounds check both use this slice.
          const shown = backups.slice(0, 10);
          console.log('\n' + colors.ui.key('Available Backups:'));
          shown.forEach((b, i) => {
            console.log(`  ${i + 1}. ${b.filename} (${b.size_mb.toFixed(2)} MB)`);
          });
          const answer = await prompt(`\nSelect backup [1-${shown.length}] or enter filename: `);
          const choice = String(answer).trim();
          if (/^\d+$/.test(choice)) {
            const index = parseInt(choice, 10) - 1;
            if (index < 0 || index >= shown.length) {
              console.error(colors.status.error('✗ Invalid selection'));
              process.exit(1);
            }
            backupFilePath = shown[index].path;
          } else {
            backupFilePath = path.join(backupDir, choice);
          }
        }

        // Must be an existing regular file: empty interactive input resolves to
        // the backup directory itself, which would only fail later mid-upload.
        if (!fs.existsSync(backupFilePath) || !fs.statSync(backupFilePath).isFile()) {
          console.error(colors.status.error(`\n✗ Backup file not found: ${backupFilePath}\n`));
          process.exit(1);
        }

        const ora = require('ora');
        const spinner = ora('Uploading & validating...').start();
        let report;
        try {
          report = await client.verifyBackup(backupFilePath, (uploaded, total, percent) => {
            const u = (uploaded / (1024 * 1024)).toFixed(2);
            const t = (total / (1024 * 1024)).toFixed(2);
            spinner.text = `Uploading & validating... ${percent}% (${u}/${t} MB)`;
          });
        } catch (uploadError) {
          // Mark the spinner failed, then let the outer handler print details.
          spinner.fail('Verification request failed');
          throw uploadError;
        }
        spinner.stop();

        // Report
        console.log('');
        if (report.format_version) {
          console.log(`  ${colors.ui.key('Format:')} ${report.format_version}`);
        }
        const stats = report.statistics || {};
        const parts = ['concepts', 'sources', 'instances', 'relationships', 'vocabulary']
          .filter(k => stats[k] !== undefined)
          .map(k => `${stats[k]} ${k}`);
        // Skip the line entirely when none of the known counters is present.
        if (parts.length > 0) {
          console.log(`  ${colors.ui.key('Contents:')} ${parts.join(', ')}`);
        }
        if (report.external_deps) {
          console.log(`  ${colors.ui.key('External deps:')} ${report.external_deps}`);
        }

        const printIssue = (i: any, colorFn: (s: string) => string) => {
          const loc = i.location ? ` at ${i.location}` : '';
          console.log('  ' + colorFn(`[${i.severity}] ${i.code}: ${i.message}${loc}`));
        };
        if (report.errors.length || report.warnings.length || report.notices.length) {
          console.log('');
        }
        report.errors.forEach(i => printIssue(i, colors.status.error));
        report.warnings.forEach(i => printIssue(i, colors.status.warning));
        report.notices.forEach(i => printIssue(i, colors.status.dim));

        console.log('\n' + separator());
        if (report.ok) {
          console.log(colors.status.success(
            `✓ Valid backup (${report.warnings.length} warning(s), ${report.notices.length} notice(s))`));
          console.log(separator() + '\n');
        } else {
          console.log(colors.status.error(
            `✗ Invalid backup — ${report.errors.length} error(s), ${report.warnings.length} warning(s)`));
          console.log(separator() + '\n');
          process.exit(1);
        }
      } catch (error: any) {
        console.error(colors.status.error('✗ Verification failed'));
        console.error(colors.status.error(error.response?.data?.detail || error.message));
        process.exit(1);
      }
    });
}
Loading
Loading