diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/README.md b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/README.md new file mode 100644 index 00000000..2f6d863c --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/README.md @@ -0,0 +1,3 @@ +# Vector Addition 2^28 Throughput + +This challenge ports Frontier-CS `research/problems/vector_addition/2_28` into Agentics as a `coexecuted_benchmark` payload. Public validation is tiny; official configuration/data is supplied through the private `official-runs` overlay. The private overlay contains no secrets. diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/agentics.challenge.json b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/agentics.challenge.json new file mode 100644 index 00000000..ce3c6ea8 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/agentics.challenge.json @@ -0,0 +1,34 @@ +{ + "schema_version": 1, + "request": "new_challenge", + "challenge_name": "vector-add-2-28-frontier-cs-vector-add-2-28", + "title": "Vector Addition 2^28 Throughput", + "summary": { + "en": "Optimize a Triton vector-addition kernel for 2^28 CUDA elements.", + "zh": "Optimize a Triton vector-addition kernel for 2^28 CUDA elements." + }, + "keywords": [ + "cuda", + "vector", + "triton" + ], + "readme_path": "README.md", + "bundle_path": "v1", + "private_assets": [ + { + "asset_name": "official-runs", + "kind": "private_benchmark_data", + "required": true, + "required_paths": [ + "private-benchmark/config.json", + "private-benchmark/submission_spec.json" + ], + "asset_note": "Private official data/config for Frontier-CS `research/problems/vector_addition/2_28`." 
def parse_args() -> argparse.Namespace:
    """Parse the evaluator wrapper's command-line flags.

    Required flags: ``--challenge-dir``, ``--workspace-dir``, ``--output-path``,
    ``--mode`` (either ``validation`` or ``official``) and ``--target``.
    ``--setup-dir`` is optional and defaults to ``None``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(description="Agentics coexecuted evaluator wrapper")
    # The three path-like flags share the same shape, so register them together.
    for flag in ("--challenge-dir", "--workspace-dir", "--output-path"):
        cli.add_argument(flag, required=True)
    cli.add_argument("--mode", choices=["validation", "official"], required=True)
    cli.add_argument("--target", required=True)
    cli.add_argument("--setup-dir")
    return cli.parse_args()
@contextlib.contextmanager
def captured_logs(logs: list[str]):
    """Redirect stdout/stderr for the ``with`` body and record what was printed.

    Everything written to ``sys.stdout`` and ``sys.stderr`` inside the ``with``
    block is buffered in memory. On exit the two buffers are joined
    (stdout, newline, stderr), stripped, truncated via ``truncate`` and, when
    non-empty, appended to ``logs`` as a single entry.

    The append happens in a ``finally`` clause, so output produced before the
    body raised is still recorded. The exception itself is not swallowed and
    propagates to the caller.

    Args:
        logs: List that receives at most one new entry per use.
    """
    stdout = io.StringIO()
    stderr = io.StringIO()
    try:
        with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
            yield
    finally:
        # Runs on both normal exit and failure: the captured output is most
        # valuable precisely when the wrapped evaluator blew up.
        text = (stdout.getvalue() + "\n" + stderr.getvalue()).strip()
        if text:
            logs.append(truncate(text))
"resources/submission_spec.json")) + cwd = Path.cwd() + output_dir.mkdir(parents=True, exist_ok=True) + try: + os.chdir(output_dir) + return source.evaluate(solution_path, spec_path) + finally: + os.chdir(cwd) + + +def apply_benchmark_override(source: Any, config: dict[str, Any]) -> None: + override = config.get("benchmark_override") + if not override: + return + if override == "vector_sizes": + sizes = [int(value) for value in config.get("sizes", [])] + if sizes: + source._determine_large_test_sizes = lambda: sizes + if "num_samples" in config: + source.NUM_VECTOR_SAMPLES = int(config["num_samples"]) + if "gpu_warmups" in config: + source.GPU_WARMUP_ITERS = int(config["gpu_warmups"]) + if "inner_warmups" in config: + source.INNER_ADD_WARMUP_ITERS = int(config["inner_warmups"]) + return + + import benchmark # type: ignore + + if override == "gemm_shapes": + shapes = [tuple(item) for item in config["shapes"]] + baseline = source.baseline_matmul + def run_benchmark(answer, baseline_matmul=baseline, print_output=False): + rows = [benchmark._bench_pair(int(m), int(n), int(k), answer, baseline_matmul) for m, n, k in shapes] + return summarize_rows(rows) + source.run_benchmark = run_benchmark + return + + if override == "quant_dot_shapes": + shapes = [tuple(item) for item in config["shapes"]] + baseline = source.baseline_quant_dot + def run_benchmark(answer, baseline_fn=baseline, print_output=False): + rows = [benchmark._bench_pair(int(m), int(n), answer, baseline_fn) for m, n in shapes] + return summarize_rows(rows) + source.run_benchmark = run_benchmark + return + + if override == "qknorm_shapes": + shapes = [tuple(item) for item in config["shapes"]] + baseline = source.baseline_qknorm + def run_benchmark(answer, baseline_fn=baseline, print_output=False): + rows = [benchmark._bench_pair(int(b), int(kv), int(qo), int(hd), answer, baseline_fn) for b, kv, qo, hd in shapes] + return summarize_rows(rows) + source.run_benchmark = run_benchmark + return + + raise 
def summarize_rows(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-shape benchmark rows into speedup statistics.

    For each row the answer time is read from ``"answer_ms"`` and the baseline
    time from ``"baseline_ms"`` (falling back to ``"gpu_baseline_ms"``). Both
    are sanitised with ``finite``. A row contributes a speedup
    (``baseline / answer``) only when both timings are strictly positive.

    Args:
        rows: Benchmark result dictionaries, one per benchmarked shape.

    Returns:
        A dict with the original ``rows``, the arithmetic mean, geometric mean
        and median of the collected speedups (all ``0.0`` when no row produced
        a speedup), and ``pass_all``, which is true when every row has a truthy
        ``"close_passed"`` value (vacuously true for an empty ``rows``).
    """
    speedups: list[float] = []
    for row in rows:
        answer = finite(row.get("answer_ms", 0.0))
        baseline = finite(row.get("baseline_ms", row.get("gpu_baseline_ms", 0.0)))
        if answer > 0 and baseline > 0:
            speedups.append(baseline / answer)
    if speedups:
        count = len(speedups)
        arith = sum(speedups) / count
        # Clamp before log() so a ratio that underflowed to 0.0 cannot raise.
        geo = math.exp(sum(math.log(max(value, 1e-12)) for value in speedups) / count)
        ordered = sorted(speedups)
        mid = count // 2
        # True median: the middle element for odd counts, the mean of the two
        # middle elements for even counts (indexing `count // 2` alone would
        # report the upper-middle value).
        median = ordered[mid] if count % 2 else (ordered[mid - 1] + ordered[mid]) / 2.0
    else:
        arith = geo = median = 0.0
    return {
        "rows": rows,
        "arithmetic_mean_speedup": arith,
        "geometric_mean_speedup": geo,
        "median_speedup": median,
        "pass_all": all(bool(row.get("close_passed")) for row in rows),
    }
def run_llm_sql(config: dict[str, Any], challenge_dir: Path, workspace_dir: Path, output_dir: Path) -> dict[str, Any]:
    """Run the ``llm_sql`` source evaluator against the workspace solution.

    Imports ``source-evaluator.py`` from the challenge directory, builds its
    ``Evaluator`` and configures it from ``config``:

    * ``datasets`` (optional list): paths relative to ``challenge_dir``,
      assigned to ``trace_files``.
    * ``col_merges`` (optional): copied onto the evaluator when present.

    The baseline cache file is placed under ``output_dir / "tmp"``, which is
    created if missing.

    Returns:
        Whatever the evaluator's ``evaluate`` returns for ``solution.py``.
    """
    evaluator_module = import_module(challenge_dir / "source-evaluator.py", "frontier_llm_sql_evaluator")
    runner = evaluator_module.Evaluator(str(challenge_dir))

    dataset_paths = []
    for relative in config.get("datasets", []):
        dataset_paths.append(str(challenge_dir / relative))
    runner.trace_files = dataset_paths

    if "col_merges" in config:
        runner.col_merges = config["col_merges"]

    # Keep the evaluator's baseline cache inside the per-run output tree.
    scratch = output_dir / "tmp"
    scratch.mkdir(parents=True, exist_ok=True)
    runner.baseline_cache_file = str(scratch / "baseline_cache.json")

    solution_file = workspace_dir / "solution.py"
    return runner.evaluate(str(solution_file))
def run_vdb(config: dict[str, Any], challenge_dir: Path, workspace_dir: Path) -> dict[str, Any]:
    """Evaluate a vector-index submission with the ``vdb_pareto`` source evaluator.

    When ``config["blocked_reason"]`` is truthy the run is refused with a
    ``RuntimeError`` carrying that reason. Otherwise the source evaluator and
    the solution module are loaded and the solution's index class is located.

    * ``dataset == "synthetic"``: a seeded random base/query set is generated
      (``seed``, ``dim``, ``base_vectors``, ``queries``), the exact nearest
      base vector per query is used as ground truth, and the index is scored
      twice: once with the evaluator's default score config and once with the
      score bounds replaced by +/- infinity.
    * anything else: delegates to the source evaluator's ``evaluate``.

    Returns:
        The result dictionary for the run.

    Raises:
        RuntimeError: If the configuration declares a ``blocked_reason``.
    """
    blocked_reason = config.get("blocked_reason")
    if blocked_reason:
        raise RuntimeError(str(blocked_reason))

    evaluator = import_module(challenge_dir / "source-evaluator.py", "frontier_vdb_evaluator")
    solution_file = workspace_dir / "solution.py"
    index_class = evaluator.find_solution_class(evaluator.load_solution_module(solution_file))

    if config.get("dataset") != "synthetic":
        return evaluator.evaluate(solution_file, k=int(config.get("k", 1)))

    import numpy as np

    rng = np.random.default_rng(int(config.get("seed", 2026)))
    dim, num_base, num_queries = (
        int(config.get(key, default))
        for key, default in (("dim", 16), ("base_vectors", 128), ("queries", 16))
    )
    # Draw order is part of the contract: base vectors first, then queries.
    xb = rng.normal(size=(num_base, dim)).astype("float32")
    xq = rng.normal(size=(num_queries, dim)).astype("float32")

    # Brute-force squared L2 distance of every query to every base vector;
    # only the single closest base index per query is kept as ground truth.
    deltas = xq[:, None, :] - xb[None, :, :]
    squared_l2 = (deltas ** 2).sum(axis=2)
    gt = np.argsort(squared_l2, axis=1)[:, :1].astype("int64")

    index = index_class(dim)
    index.add(xb)
    metrics = evaluator.evaluate_index(index, xq, gt, int(config.get("k", 1)))
    score = evaluator.compute_score(metrics)

    # Copy the config (and its nested "scoring" table) before lifting the
    # score bounds so the evaluator's module-level SCORE_CONFIG is not mutated.
    unbounded_scoring = dict(evaluator.SCORE_CONFIG["scoring"])
    unbounded_scoring.update(max_score=float("inf"), min_score=float("-inf"))
    unbounded_cfg = {**evaluator.SCORE_CONFIG, "scoring": unbounded_scoring}
    score_unbounded = evaluator.compute_score(metrics, unbounded_cfg)

    return {
        "status": "success",
        "runs_successfully": 1.0,
        "score": score,
        "score_unbounded": score_unbounded,
        "metrics": metrics,
    }
def write_agentics_result(output_path: Path, mode: str, result: dict[str, Any], logs: list[str], declared_metrics: set[str]) -> None:
    """Translate an evaluator result into the Agentics result JSON and write it.

    A run counts as passed only when the result has no ``"error"``, its
    ``"runs_successfully"`` value is positive, ``"correct"`` is truthy
    (default ``True``) and ``"pass_all"`` is not explicitly ``False``.

    Args:
        output_path: File that receives the JSON payload (UTF-8, sorted keys).
        mode: ``"validation"`` selects the ``validation_summary`` key and adds
            a single ``public_results`` entry; any other value uses
            ``official_summary``.
        result: Raw dictionary produced by the runner (or the error stub).
        logs: Captured log entries; only the first four are kept, each truncated.
        declared_metrics: Metric names allowed in ``aggregate_metrics``.
    """
    is_validation = mode == "validation"
    score = finite(result.get("score", 0.0))
    score_unbounded = finite(result.get("score_unbounded", score))
    error = result.get("error")

    # Any single one of these conditions marks the run as failed.
    failure_signals = (
        error is not None,
        finite(result.get("runs_successfully", 1.0)) <= 0,
        not bool(result.get("correct", True)),
        result.get("pass_all") is False,
    )
    passed = not any(failure_signals)
    status = "passed" if passed else "failed"

    metrics = collect_metrics(result, score, score_unbounded, passed, declared_metrics)

    log_entries = [truncate(item) for item in logs[:4]]
    if error is not None:
        log_entries.append(truncate(str(error)))

    summary = {"score": score, "passed": 1 if passed else 0, "total": 1}
    payload: dict[str, Any] = {
        "status": status,
        "mode": mode,
        "rank_score": score,
        "aggregate_metrics": metrics,
        ("validation_summary" if is_validation else "official_summary"): summary,
        "logs": log_entries,
    }
    if is_validation:
        public_case = {
            "case_name": "public-validation",
            "status": status,
            "score": score,
            "message": truncate(str(error or "ok"), 500),
        }
        payload["public_results"] = [public_case]

    serialized = json.dumps(payload, indent=2, sort_keys=True)
    output_path.write_text(serialized, encoding="utf-8")
def finite(value: Any) -> float:
    """Coerce ``value`` to a finite float, substituting ``0.0`` when impossible.

    Returns ``float(value)`` when the conversion succeeds and the result is
    neither infinite nor NaN. Any conversion failure, as well as ``inf``,
    ``-inf`` and ``nan``, yields ``0.0``.
    """
    try:
        converted = float(value)
    except Exception:  # best-effort coercion: any failure means "no usable number"
        converted = math.nan
    if math.isfinite(converted):
        return converted
    return 0.0
[truncated]" + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/coexecuted-evaluator/setup.py b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/coexecuted-evaluator/setup.py new file mode 100644 index 00000000..ff34c650 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/coexecuted-evaluator/setup.py @@ -0,0 +1,39 @@ +from __future__ import annotations +import argparse, json, os, shutil, subprocess +from pathlib import Path +ENV_PROJECT_DIR = "evaluator-env" +PYTHON_INSTALL_DIR = "uv-python" +PYTHON_REQUEST = "3.12" +PYPROJECT = '[project]\nname = "vector_add_2_28_frontier_cs_vector_add_2_28"\nversion = "0.1.0"\nrequires-python = ">=3.12,<3.13"\ndependencies = [\n "torch>=2.11.0,<2.12.0",\n "triton>=3.5.0,<4",\n "numpy>=1.26",\n "tqdm>=4.64",\n]\n\n[tool.uv]\npackage = false\n\n[tool.uv.sources]\ntorch = [\n { index = "pytorch-cu130", marker = "sys_platform == \'linux\'" },\n]\n\n[[tool.uv.index]]\nname = "pytorch-cu130"\nurl = "https://download.pytorch.org/whl/cu130"\nexplicit = true\n' + +def main() -> int: + parser = argparse.ArgumentParser(description="Set up evaluator env") + parser.add_argument("--challenge-dir", required=True) + parser.add_argument("--setup-dir", required=True) + parser.add_argument("--mode", choices=["validation", "official"], required=True) + parser.add_argument("--target", required=True) + args = parser.parse_args() + setup_dir = Path(args.setup_dir) + project_dir = setup_dir / ENV_PROJECT_DIR + project_dir.mkdir(parents=True, exist_ok=True) + (project_dir / "pyproject.toml").write_text(PYPROJECT, encoding="utf-8") + env = os.environ.copy() + env["UV_CACHE_DIR"] = str(setup_dir / "uv-cache") + env["UV_LINK_MODE"] = "copy" + env["UV_PROJECT_ENVIRONMENT"] = str(project_dir / ".venv") + env["UV_PYTHON_INSTALL_DIR"] = str(setup_dir / PYTHON_INSTALL_DIR) + subprocess.run(["uv", "python", "install", PYTHON_REQUEST], check=True, 
def find_managed_python(env: dict[str, str]) -> Path:
    """Ask ``uv`` for the managed interpreter matching ``PYTHON_REQUEST``.

    Args:
        env: Environment passed to the ``uv`` subprocess.

    Returns:
        Path printed by ``uv python find`` (symlinks resolved by ``uv``).

    Raises:
        subprocess.CalledProcessError: If ``uv`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``uv`` does not finish within 60 seconds.
        RuntimeError: If the reported path is not an existing file.
    """
    command = ["uv", "python", "find", PYTHON_REQUEST, "--managed-python", "--resolve-links"]
    completed = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    path = Path(completed.stdout.strip())
    if path.is_file():
        return path
    raise RuntimeError(f"managed Python not found at {path}")
diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/public/config.json b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/public/config.json new file mode 100644 index 00000000..6b0ed234 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/public/config.json @@ -0,0 +1,11 @@ +{ + "runner": "frontier_python_evaluate", + "submission_spec_path": "public/submission_spec.json", + "benchmark_override": "vector_sizes", + "sizes": [ + 1024 + ], + "num_samples": 1, + "gpu_warmups": 1, + "inner_warmups": 1 +} diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/public/submission_spec.json b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/public/submission_spec.json new file mode 100644 index 00000000..40b994da --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/public/submission_spec.json @@ -0,0 +1,15 @@ +{ + "problem_name": "vector_addition", + "description": "Triton kernel optimization problem for high-performance vector addition", + "requirements": { + "cuda_backend": true, + "gpu_required": true, + "triton_version": ">=2.1.0", + "torch_version": ">=2.0.0" + }, + "evaluation": { + "timeout_seconds": 300, + "memory_limit_mb": 8192, + "gpu_memory_limit_mb": 4096 + } +} \ No newline at end of file diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/resources/pyproject.toml b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/resources/pyproject.toml new file mode 100644 index 00000000..db74b8e4 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/resources/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "vector-addition" +version = "0.1.0" +description = "Vector addition problem resources" +requires-python = ">=3.8" +dependencies = [] +# Docker image already has torch, triton, numpy, tqdm + +[build-system] +requires = ["setuptools>=45", "wheel"] +build-backend = "setuptools.build_meta" diff --git 
# Element-wise addition kernel: each program instance handles one contiguous
# chunk of BLOCK_SIZE elements and writes x + y for that chunk to the output.
@triton.jit
def add_kernel(x_ptr,  # *Pointer* to first input vector.
               y_ptr,  # *Pointer* to second input vector.
               output_ptr,  # *Pointer* to output vector.
               n_elements,  # Size of the vector.
               BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
               # NOTE: `constexpr` so it can be used as a shape value.
               ):
    # There are multiple 'programs' processing different data. We identify which program
    # we are here:
    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
    # This program will process inputs that are offset from the initial data.
    # For instance, if you had a vector of length 256 and block_size of 64, the programs
    # would each access the elements [0:64, 64:128, 128:192, 192:256].
    # Note that `offsets` holds element indices, not pointers; they only become
    # addresses once added to a base pointer in the loads/stores below.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask to guard memory operations against out-of-bounds accesses.
    # The last program may cover indices >= n_elements; those lanes are disabled.
    mask = offsets < n_elements
    # Load x and y from DRAM, masking out any extra elements in case the input is not a
    # multiple of the block size.
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    # Write x + y back to DRAM, using the same mask so nothing is stored past the end.
    tl.store(output_ptr + offsets, output, mask=mask)
+ # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel. + # - Don't forget to pass meta-parameters as keywords arguments. + add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) + # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still + # running asynchronously at this point. + return output + + +# %% +# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness: + +torch.manual_seed(0) +size = 98432 +x = torch.rand(size, device=DEVICE) +y = torch.rand(size, device=DEVICE) +output_torch = x + y +output_triton = add(x, y) +print(output_torch) +print(output_triton) +print(f'The maximum difference between torch and triton is ' + f'{torch.max(torch.abs(output_torch - output_triton))}') + +# %% +# Seems like we're good to go! + +# %% +# Benchmark +# --------- +# +# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch. +# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops. +# for different problem sizes. + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=['size'], # Argument names to use as an x-axis for the plot. + x_vals=[2**i for i in range(12, 28, 1)], # Different possible values for `x_name`. + x_log=True, # x axis is logarithmic. + line_arg='provider', # Argument name whose value corresponds to a different line in the plot. + line_vals=['triton', 'torch'], # Possible values for `line_arg`. + line_names=['Triton', 'Torch'], # Label name for the lines. + styles=[('blue', '-'), ('green', '-')], # Line styles. + ylabel='GB/s', # Label name for the y-axis. + plot_name='vector-add-performance', # Name for the plot. Used also as a file name for saving the plot. + args={}, # Values for function arguments not in `x_names` and `y_name`. 
+ )) +def benchmark(size, provider): + x = torch.rand(size, device=DEVICE, dtype=torch.float32) + y = torch.rand(size, device=DEVICE, dtype=torch.float32) + quantiles = [0.5, 0.2, 0.8] + if provider == 'torch': + ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y, quantiles=quantiles) + if provider == 'triton': + ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y), quantiles=quantiles) + gbps = lambda ms: 3 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3) + return gbps(ms), gbps(max_ms), gbps(min_ms) + + +# %% +# We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or +# `save_path='/path/to/results/' to save them to disk along with raw CSV data: +benchmark.run(print_data=True, show_plots=False) diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/source-evaluator.py b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/source-evaluator.py new file mode 100644 index 00000000..ce9b10b0 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/source-evaluator.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python3 +import argparse +import importlib.util +import json +import math +import os +import sys +from pathlib import Path +from types import ModuleType +from typing import Any, Dict, List, Tuple + +# Add resources to path for imports +HERE = Path(__file__).resolve().parent +RESOURCES_DIR = HERE / "resources" +sys.path.insert(0, str(RESOURCES_DIR)) + +import torch +import triton +import numpy as np + +DEFAULT_SPEC = HERE / "resources" / "submission_spec.json" +ARTIFACT_PATH = Path("./output_ans").resolve() + +DEVICE = triton.runtime.driver.active.get_active_torch_device() + + +def _determine_large_test_sizes() -> List[int]: + """Return test size: 2^28 (268,435,456 elements).""" + return [2**28] + + +DEFAULT_SEED = 1337 +NUM_VECTOR_SAMPLES = 5 +GPU_WARMUP_ITERS = 10 +INNER_ADD_WARMUP_ITERS = 5 + + +def warmup_gpu(iters: int = 
GPU_WARMUP_ITERS) -> None: + """Run a few trivial GPU ops to warm up kernels and clocks.""" + if not torch.cuda.is_available(): + return + torch.cuda.synchronize() + n = 1 << 20 + a = torch.rand(n, device=DEVICE, dtype=torch.float32) + b = torch.rand(n, device=DEVICE, dtype=torch.float32) + for _ in range(max(1, int(iters))): + c = a + b + torch.cuda.synchronize() + + +def load_solution_module(solution_path: Path) -> ModuleType: + """Load the solution module from the given path.""" + if not solution_path.exists(): + raise FileNotFoundError(f"solution.py not found at {solution_path}") + spec = importlib.util.spec_from_file_location("submitted_solution", solution_path) + if spec is None or spec.loader is None: + raise ImportError(f"Failed to load spec for {solution_path}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module # Register before exec for self-referential imports + spec.loader.exec_module(module) + return module + + +def materialize_artifact(result: Any, solution_path: Path) -> Path: + """Materialize the solution result into an artifact file.""" + ARTIFACT_PATH.parent.mkdir(parents=True, exist_ok=True) + if isinstance(result, dict): + with ARTIFACT_PATH.open("w", encoding="utf-8") as fout: + json.dump(result, fout) + return ARTIFACT_PATH + if isinstance(result, str): + # Check if the string could be a file path (reasonable length and no newlines) + # before calling is_file() to avoid "File name too long" errors + is_possible_path = len(result) < 4096 and '\n' not in result + if is_possible_path: + candidate = Path(result) + try: + if candidate.is_file(): + with ARTIFACT_PATH.open("w", encoding="utf-8") as fout: + json.dump({"program_path": str(candidate.resolve())}, fout) + return ARTIFACT_PATH + except OSError: + # Path too long or other OS error - treat as code string + pass + # Treat as code string + with ARTIFACT_PATH.open("w", encoding="utf-8") as fout: + fout.write(result) + return ARTIFACT_PATH + raise TypeError( + 
"Solution.solve() must return a dict/path-string/code-string; got " + f"{type(result)!r}." + ) + + +def load_add_from_artifact(artifact_path: Path) -> Any: + """Load the add function from the artifact.""" + with artifact_path.open("r", encoding="utf-8") as fin: + artifact = json.load(fin) + + if "code" in artifact: + # Write code to temporary file and import as module to avoid Triton source inspection issues + import tempfile + import os + + try: + # Create temporary file + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + f.write(artifact["code"]) + temp_file = f.name + + # Import the module + spec = importlib.util.spec_from_file_location("temp_add_module", temp_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if not hasattr(module, "add"): + raise ValueError("Code must define an 'add' function") + + # Don't delete temp file - Triton JIT needs source file at compile time + return module.add + except Exception as e: + raise + + elif "program_path" in artifact: + # Load from external file + program_path = Path(artifact["program_path"]) + if not program_path.exists(): + raise FileNotFoundError(f"Program file not found: {program_path}") + + spec = importlib.util.spec_from_file_location("submitted_program", program_path) + if spec is None or spec.loader is None: + raise ImportError(f"Failed to load spec for {program_path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if not hasattr(module, "add"): + raise ValueError("Program must define an 'add' function") + return module.add + + else: + raise ValueError("Artifact must contain either 'code' or 'program_path'") + + +def benchmark_add(add_func: Any, sizes: List[int], seed: int = DEFAULT_SEED, num_samples: int = NUM_VECTOR_SAMPLES) -> Dict[str, Any]: + """Benchmark the add function against PyTorch baseline with seeding and averaging.""" + results = [] + + # Warm up the GPU for more stable timings + 
warmup_gpu(GPU_WARMUP_ITERS) + + for size in sizes: + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + + pytorch_ms_list = [] + cpu_ms_list = [] + custom_ms_list = [] + correctness_list = [] + + for sample_idx in range(max(1, int(num_samples))): + # Create test vectors deterministically + x = torch.rand(size, device=DEVICE, dtype=torch.float32) + y = torch.rand(size, device=DEVICE, dtype=torch.float32) + # CPU baseline vectors + x_cpu = x.detach().cpu() + y_cpu = y.detach().cpu() + + # PyTorch baseline (GPU) + def pytorch_add(): + return x + y + # Inner warmup additions before timing + if torch.cuda.is_available(): + for _ in range(INNER_ADD_WARMUP_ITERS): + _ = pytorch_add() + torch.cuda.synchronize() + pytorch_ms = triton.testing.do_bench(pytorch_add, quantiles=[0.5]) + if isinstance(pytorch_ms, (tuple, list)): + pytorch_ms = pytorch_ms[0] + pytorch_ms_list.append(float(pytorch_ms)) + + # Naive CPU baseline + def cpu_add(): + return x_cpu + y_cpu + cpu_ms = triton.testing.do_bench(cpu_add, quantiles=[0.5]) + if isinstance(cpu_ms, (tuple, list)): + cpu_ms = cpu_ms[0] + cpu_ms_list.append(float(cpu_ms)) + + # Custom implementation (GPU) + def custom_add(): + return add_func(x, y) + # Inner warmup additions before timing + if torch.cuda.is_available(): + for _ in range(INNER_ADD_WARMUP_ITERS): + _ = custom_add() + torch.cuda.synchronize() + custom_ms = triton.testing.do_bench(custom_add, quantiles=[0.5]) + if isinstance(custom_ms, (tuple, list)): + custom_ms = custom_ms[0] + custom_ms_list.append(float(custom_ms)) + + # Correctness test on this sample + pytorch_result = pytorch_add() + custom_result = custom_add() + is_correct = torch.allclose(pytorch_result, custom_result, rtol=1e-5, atol=1e-8) + correctness_list.append(bool(is_correct)) + + # Aggregate timings as medians for stability + def median(lst): + s = sorted(lst) + mid = len(s) // 2 + if len(s) % 2 == 1: + return s[mid] + return 0.5 * 
(s[mid - 1] + s[mid]) + + pytorch_ms = median(pytorch_ms_list) + cpu_ms = median(cpu_ms_list) + custom_ms = median(custom_ms_list) + + # Bandwidths (GB/s) + pytorch_bandwidth = 3 * size * 4 * 1e-9 / (pytorch_ms * 1e-3) + cpu_bandwidth = 3 * size * 4 * 1e-9 / (cpu_ms * 1e-3) + custom_bandwidth = 3 * size * 4 * 1e-9 / (custom_ms * 1e-3) + + is_correct = all(correctness_list) + + results.append({ + "size": size, + "pytorch_ms": pytorch_ms, + "cpu_ms": cpu_ms, + "custom_ms": custom_ms, + "pytorch_bandwidth": pytorch_bandwidth, + "cpu_bandwidth": cpu_bandwidth, + "custom_bandwidth": custom_bandwidth, + "speedup": pytorch_ms / custom_ms if custom_ms > 0 else 0.0, + "bandwidth_ratio": custom_bandwidth / cpu_bandwidth if cpu_bandwidth > 0 else 0.0, + "is_correct": is_correct, + }) + + return results + + +def evaluate_vector_addition(add_func: Any) -> Dict[str, Any]: + """Evaluate the performance of a vector addition implementation.""" + try: + # Use large sizes based on GPU memory so GPU >> CPU + sizes = _determine_large_test_sizes() + + # Run benchmark + results = benchmark_add(add_func, sizes) + + # Enforce strict correctness: if any test fails, score 0 + if not results or not all(r["is_correct"] for r in results): + return { + "error": "Correctness not 100% across all samples/sizes", + "score": 0, + "pass_all": False, + "total_tests": len(results), + "passed_tests": sum(1 for r in results if r.get("is_correct")), + "results": results, + } + + # Calculate metrics + bandwidth_ratios = [r["bandwidth_ratio"] for r in results if r["is_correct"]] + speedups = [r["speedup"] for r in results if r["is_correct"]] + pytorch_vs_cpu = [ + max(r["pytorch_bandwidth"] / max(r["cpu_bandwidth"], 1e-12), 1e-12) + for r in results if r["is_correct"] + ] + custom_vs_cpu = [ + max(r["custom_bandwidth"] / max(r["cpu_bandwidth"], 1e-12), 1e-12) + for r in results if r["is_correct"] + ] + + if not bandwidth_ratios: + return { + "error": "All correctness tests failed", + "score": 0, + 
"pass_all": False, + } + + geometric_mean_bandwidth_ratio = math.exp(sum(math.log(r) for r in bandwidth_ratios) / len(bandwidth_ratios)) + arithmetic_mean_bandwidth_ratio = sum(bandwidth_ratios) / len(bandwidth_ratios) + gm_pytorch_vs_cpu = math.exp(sum(math.log(r) for r in pytorch_vs_cpu) / len(pytorch_vs_cpu)) + gm_custom_vs_cpu = math.exp(sum(math.log(r) for r in custom_vs_cpu) / len(custom_vs_cpu)) + + # Calculate score (0-100 scale) + # Anchor 0 at CPU naive baseline (custom/cpu = 1x) + # Anchor 100 at 2x PyTorch GPU baseline (custom/cpu = 2 * pytorch/cpu) + target = max(2.0 * gm_pytorch_vs_cpu, 1.0 + 1e-12) + numerator = max(0.0, gm_custom_vs_cpu - 1.0) + denominator = max(target - 1.0, 1e-12) + normalized_unbounded = numerator / denominator + normalized = max(0.0, min(1.0, normalized_unbounded)) + score_unbounded = normalized_unbounded * 100.0 + score = max(0.0, min(100.0, score_unbounded)) + + return { + "geometric_mean_bandwidth_ratio": geometric_mean_bandwidth_ratio, + "arithmetic_mean_bandwidth_ratio": arithmetic_mean_bandwidth_ratio, + "geometric_mean_custom_vs_cpu": gm_custom_vs_cpu, + "geometric_mean_pytorch_vs_cpu": gm_pytorch_vs_cpu, + "score": score, + "score_unbounded": score_unbounded, + "pass_all": True, + "total_tests": len(results), + "passed_tests": sum(1 for r in results if r["is_correct"]), + "results": results, + } + + except Exception as e: + return { + "error": str(e), + "score": 0, + "pass_all": False, + } + + +def evaluate(solution_path: Path, spec_path: Path) -> dict: + """Main evaluation function.""" + try: + # Load solution module + module = load_solution_module(solution_path) + + if not hasattr(module, "Solution"): + raise ValueError("Solution module must define a 'Solution' class") + + solution_class = module.Solution + solution_instance = solution_class() + + if not hasattr(solution_instance, "solve"): + raise ValueError("Solution class must have a 'solve' method") + + # Get solution result + result = 
solution_instance.solve(spec_path) + + # Materialize artifact + artifact_path = materialize_artifact(result, solution_path) + + # Load add function from artifact + add_func = load_add_from_artifact(artifact_path) + + # Evaluate performance + evaluation_result = evaluate_vector_addition(add_func) + + return { + "status": "success", + "artifact_path": str(artifact_path), + **evaluation_result, + } + + except Exception as e: + return { + "status": "error", + "error": str(e), + "score": 0, + } + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate vector addition solutions") + parser.add_argument( + "--solution-path", + type=Path, + default=Path("./solution.py"), + help="Path to solution.py file", + ) + parser.add_argument( + "--spec-path", + type=Path, + default=DEFAULT_SPEC, + help="Path to specification file", + ) + parser.add_argument( + "--output-path", + type=Path, + default=Path("./result.json"), + help="Path to output result file", + ) + + args = parser.parse_args() + + # Run evaluation + result = evaluate(args.solution_path, args.spec_path) + + # Write result + with args.output_path.open("w", encoding="utf-8") as fout: + json.dump(result, fout, indent=2) + + # Print summary + if result["status"] == "success": + print(f"Evaluation completed successfully!") + print(f"Score: {result.get('score', 0):.2f}/100") + if 'geometric_mean_bandwidth_ratio' in result: + print(f"Geometric mean bandwidth ratio: {result['geometric_mean_bandwidth_ratio']:.3f}x") + if 'passed_tests' in result and 'total_tests' in result: + print(f"Tests passed: {result['passed_tests']}/{result['total_tests']}") + if 'error' in result: + print(f"Error: {result['error']}") + # Print score as last line for main_loop.sh to extract + # Format: "score score_unbounded" (space-separated) + score = result.get('score', 0) + score_unbounded = result.get('score_unbounded', score) + print(f"{score} {score_unbounded}") + else: + print(f"Evaluation failed: {result.get('error', 'Unknown 
error')}") + # Print error score as last line + print("0") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/spec.json b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/spec.json new file mode 100644 index 00000000..ea0919b2 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/spec.json @@ -0,0 +1,166 @@ +{ + "schema_version": 1, + "challenge_name": "vector-add-2-28-frontier-cs-vector-add-2-28", + "challenge_title": "Vector Addition 2^28 Throughput", + "summary": { + "en": "Optimize a Triton vector-addition kernel for 2^28 CUDA elements.", + "zh": "Optimize a Triton vector-addition kernel for 2^28 CUDA elements." + }, + "keywords": [ + "cuda", + "vector", + "triton" + ], + "solution": { + "protocol": "zip_project", + "manifest_file": "agentics.solution.json" + }, + "targets": [ + { + "name": "linux-arm64-cuda", + "docker_platform": "linux/arm64", + "accelerator": "gpu", + "validation_enabled": true, + "resource_profile": { + "name": "agentics-cuda-cu130-gb10", + "solution_image": { + "source": "registry", + "reference": "ghcr.io/agentic-science/agentics-linux-arm64-cuda:cu130-ubuntu24.04-v0.2.5@sha256:8e3da4a65e297e3b1e9800da001fa2bbac9ed48453a6972117a0c3ad1d1eef13" + }, + "evaluator_image": { + "source": "registry", + "reference": "ghcr.io/agentic-science/agentics-linux-arm64-cuda:cu130-ubuntu24.04-v0.2.5@sha256:8e3da4a65e297e3b1e9800da001fa2bbac9ed48453a6972117a0c3ad1d1eef13" + }, + "solution": { + "setup": { + "timeout_sec": 120, + "memory_limit_mb": 2048, + "cpu_limit_millis": 2000, + "disk_limit_mb": 2048, + "network_access": "disabled" + }, + "build": { + "timeout_sec": 120, + "memory_limit_mb": 2048, + "cpu_limit_millis": 2000, + "disk_limit_mb": 2048, + "network_access": "disabled" + } + }, + "evaluator": { + "setup": { + "timeout_sec": 1200, + "memory_limit_mb": 6144, + "cpu_limit_millis": 4000, + "disk_limit_mb": 16384, + "network_access": 
"enabled" + }, + "run": { + "timeout_sec": 3600, + "memory_limit_mb": 8192, + "cpu_limit_millis": 8000, + "disk_limit_mb": 8192, + "network_access": "disabled" + } + }, + "resource_description": "ARM64 CUDA 13.0 profile for Frontier-CS coexecuted GPU benchmarks.", + "hardware_metadata": { + "kind": "cuda", + "gpu_model": "NVIDIA GB10", + "gpu_count": 1, + "cuda_variant": "cu130", + "cuda_version": "13.0", + "driver_minimum": "580.142" + } + } + } + ], + "starts_at": "2026-01-01T00:00:00Z", + "eligibility": { + "type": "open" + }, + "visibility": { + "leaderboard": "public_live", + "score_distribution": "public_live", + "result_detail": "submitter_live_public_live" + }, + "solution_publication": "public", + "execution": { + "mode": "coexecuted_benchmark", + "coexecuted_evaluator": { + "command": [ + "python", + "coexecuted-evaluator/run.py" + ], + "result_file": "result.json" + }, + "acknowledge_danger": true, + "validation_setup": { + "command": [ + "python", + "coexecuted-evaluator/setup.py" + ], + "reproducibility_notes": "Creates a uv-managed project environment under /setup using uv sync; no uv pip interface is used." + }, + "official_evaluation_setup": { + "command": [ + "python", + "coexecuted-evaluator/setup.py" + ], + "reproducibility_notes": "Creates the same uv-managed project environment for official evaluation; private benchmark data is not used for dependency resolution." 
+ } + }, + "datasets": { + "public_dir": "public", + "private_benchmark_dir": "private-benchmark", + "public_policy": "full", + "private_benchmark_policy": "score_only", + "private_benchmark_enabled": true + }, + "metric_schema": { + "metrics": [ + { + "name": "score", + "label": "Score", + "direction": "maximize", + "visibility": "public", + "metric_description": "Score" + }, + { + "name": "score_unbounded", + "label": "Unbounded Score", + "direction": "maximize", + "visibility": "public", + "metric_description": "Unbounded Score" + }, + { + "name": "correctness", + "label": "Correctness", + "direction": "maximize", + "visibility": "public", + "metric_description": "Correctness" + }, + { + "name": "geometric_mean_speedup", + "label": "Geometric Mean Speedup", + "direction": "maximize", + "visibility": "public", + "metric_description": "Geometric Mean Speedup" + }, + { + "name": "passed_tests", + "label": "Passed Tests", + "direction": "maximize", + "visibility": "public", + "metric_description": "Passed Tests" + } + ], + "ranking": { + "primary_metric_name": "score", + "tie_breaker_metric_names": [ + "score_unbounded", + "correctness" + ] + } + } +} diff --git a/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/statement.md b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/statement.md new file mode 100644 index 00000000..d2658e98 --- /dev/null +++ b/challenges/vector-add-2-28-frontier-cs-vector-add-2-28/v1/statement.md @@ -0,0 +1,111 @@ +# Vector Addition 2^28 Throughput + +Ported from Frontier-CS `research/problems/vector_addition/2_28`. + +## Agentics Interface + +Submit a ZIP project containing the source interface described below. The trusted evaluator imports or compiles participant code from `/workspace`, so this challenge uses `coexecuted_benchmark` with `acknowledge_danger: true`. + +## Public And Official Data + +Public validation uses a small deterministic configuration committed under `v1/public`. 
Official scoring uses the private `official-runs` overlay under `private-benchmark/`. + +## Original Statement + +Vector Addition Problem - Very Large Vectors (2^28) +============================================== + +Problem Setting +--------------- +Design and optimize high-performance Triton kernels for vector addition on GPU with very large vectors (268,435,456 elements). This problem focuses on implementing efficient element-wise addition for maximum throughput scenarios. + +The challenge involves optimizing: +- **Memory access patterns**: Efficient loading and storing of large vector data +- **Block sizing**: Optimal block sizes for large GPU workloads +- **Memory bandwidth**: Maximizing throughput at scale +- **Performance benchmarking**: Achieving speedup over PyTorch baseline + +This variant tests performance on very large vectors (2^28 = 268,435,456 elements = 1 GB per vector). Requires ~3 GB GPU memory total. + +Target +------ +- **Primary**: Maximize bandwidth (GB/s) over PyTorch baseline (higher is better) +- **Secondary**: Ensure correctness on large vectors +- **Tertiary**: Minimize memory overhead + +API Specification +----------------- +Implement a `Solution` class that returns a Triton kernel implementation: + +```python +class Solution: + def solve(self, spec_path: str = None) -> dict: + """ + Returns a dict with either: + - {"code": "python_code_string"} + - {"program_path": "path/to/kernel.py"} + """ + # Your implementation + pass +``` + +Your kernel implementation must provide: + +```python +import torch +import triton +import triton.language as tl + +def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """ + Element-wise addition of two vectors. 
+ + Args: + x: Input tensor of shape (268435456,) + y: Input tensor of shape (268435456,) + + Returns: + Output tensor of shape (268435456,) with x + y + """ + pass +``` + +API Usage Notes +--------------- +- The evaluator looks for an `add` function in the module namespace +- Function must handle vector size of exactly 268,435,456 elements +- Must use Triton JIT compilation for kernel definition +- Should optimize for maximum memory bandwidth at scale +- Input tensors are guaranteed to be contiguous and same size +- May cause OOM on GPUs with less than 3GB memory + +Scoring (0-100) +--------------- +Performance is measured against CPU baseline and PyTorch GPU baseline: + +``` +target = max(2.0 * (pytorch_bandwidth / cpu_bandwidth), 1.0) +score = ((custom_bandwidth / cpu_bandwidth - 1.0) / (target - 1.0)) * 100 + +Where: +- custom_bandwidth = your solution's bandwidth +- cpu_bandwidth = naive CPU baseline bandwidth +- pytorch_bandwidth = PyTorch GPU baseline bandwidth +- target = 2x PyTorch performance vs CPU (normalized to custom vs CPU) + +Score is clamped to [0, 100] range +``` + +- 0 points = CPU baseline performance (custom/cpu = 1x) +- 50 points = Halfway between CPU baseline and 2x PyTorch performance +- 100 points = 2x PyTorch GPU performance vs CPU (custom/cpu = 2 * pytorch/cpu) + +Evaluation Details +------------------ +- Tested on vector size: 2^28 = 268,435,456 elements +- Performance measured in GB/s (bandwidth) +- Correctness verified with tolerance: rtol=1e-5, atol=1e-8 +- Performance measured using median execution time across 5 samples +- Requires CUDA backend and GPU support +- Requires sufficient GPU memory (may OOM on smaller GPUs) + diff --git a/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/README.md b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/README.md new file mode 100644 index 00000000..d8e3b20a --- /dev/null +++ b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/README.md @@ -0,0 +1,3 @@ +# 
vector-add-2-28-frontier-cs-vector-add-2-28 Smoke Solution + +Cheap public-validation smoke solution. diff --git a/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/agentics.solution.json b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/agentics.solution.json new file mode 100644 index 00000000..034defa2 --- /dev/null +++ b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/agentics.solution.json @@ -0,0 +1,10 @@ +{ + "protocol": "zip_project", + "protocol_version": 1, + "note": "Cheap smoke solution for vector-add-2-28-frontier-cs-vector-add-2-28.", + "commands": { + "setup": "scripts/setup.sh", + "build": "scripts/build.sh", + "run": "run.sh" + } +} diff --git a/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/run.sh b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/run.sh new file mode 100755 index 00000000..94b6de34 --- /dev/null +++ b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/run.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env sh +set -eu +exit 0 diff --git a/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/scripts/build.sh b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/scripts/build.sh new file mode 100755 index 00000000..c72e926c --- /dev/null +++ b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/scripts/build.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env sh +set -eu diff --git a/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/scripts/setup.sh b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/scripts/setup.sh new file mode 100755 index 00000000..c72e926c --- /dev/null +++ b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/scripts/setup.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env sh +set -eu diff --git a/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/solution.py b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/solution.py new file mode 100644 index 00000000..58f9af64 --- /dev/null +++ 
b/test-solutions/vector-add-2-28-frontier-cs-vector-add-2-28/solution.py @@ -0,0 +1,6 @@ +from __future__ import annotations +class Solution: + def solve(self, spec_path=None): + return {"code":"""def add(x,y): + return x + y +"""}