Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Vector Addition 2^28 Throughput

This challenge ports Frontier-CS `research/problems/vector_addition/2_28` into Agentics as a `coexecuted_benchmark` payload. Public validation is tiny; official configuration/data is supplied through the private `official-runs` overlay. The private overlay contains no secrets.
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"schema_version": 1,
"request": "new_challenge",
"challenge_name": "vector-add-2-28-frontier-cs-vector-add-2-28",
"title": "Vector Addition 2^28 Throughput",
"summary": {
"en": "Optimize a Triton vector-addition kernel for 2^28 CUDA elements.",
"zh": "针对 2^28 个 CUDA 元素优化 Triton 向量加法内核。"
},
"keywords": [
"cuda",
"vector",
"triton"
],
"readme_path": "README.md",
"bundle_path": "v1",
"private_assets": [
{
"asset_name": "official-runs",
"kind": "private_benchmark_data",
"required": true,
"required_paths": [
"private-benchmark/config.json",
"private-benchmark/submission_spec.json"
],
"asset_note": "Private official data/config for Frontier-CS `research/problems/vector_addition/2_28`."
}
],
"ci": {
"validate_manifest": true,
"validate_public_bundle": true,
"smoke_test_public_validation": false
}
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations
import argparse, json, os, shutil, subprocess
from pathlib import Path
# Sub-directory of --setup-dir that holds the generated uv project (pyproject.toml and .venv).
ENV_PROJECT_DIR = "evaluator-env"
# Sub-directory of --setup-dir passed to uv as UV_PYTHON_INSTALL_DIR.
PYTHON_INSTALL_DIR = "uv-python"
# Python version request handed to `uv python install` and `uv python find`.
PYTHON_REQUEST = "3.12"
# Literal contents written to <project>/pyproject.toml. It declares the evaluator's
# dependencies and routes `torch` to the explicit "pytorch-cu130" index on Linux.
PYPROJECT = '[project]\nname = "vector_add_2_28_frontier_cs_vector_add_2_28"\nversion = "0.1.0"\nrequires-python = ">=3.12,<3.13"\ndependencies = [\n "torch>=2.11.0,<2.12.0",\n "triton>=3.5.0,<4",\n "numpy>=1.26",\n "tqdm>=4.64",\n]\n\n[tool.uv]\npackage = false\n\n[tool.uv.sources]\ntorch = [\n { index = "pytorch-cu130", marker = "sys_platform == \'linux\'" },\n]\n\n[[tool.uv.index]]\nname = "pytorch-cu130"\nurl = "https://download.pytorch.org/whl/cu130"\nexplicit = true\n'

def main() -> int:
    """Create the evaluator's uv-managed Python environment under --setup-dir.

    Steps, in order: write the generated ``pyproject.toml``, install the
    requested Python with uv, sync the project's dependencies into
    ``<project>/.venv``, record the requested mode/target in
    ``agentics-env.json``, and finally delete the uv cache directory.

    ``--challenge-dir`` is accepted on the command line but is not read here;
    ``--mode`` and ``--target`` are only recorded in ``agentics-env.json``.

    Returns:
        ``0`` on success.

    Raises:
        subprocess.CalledProcessError: If any ``uv`` invocation exits non-zero.
        subprocess.TimeoutExpired: If a ``uv`` invocation exceeds its timeout.
        RuntimeError: If the managed interpreter cannot be located.
    """
    cli = argparse.ArgumentParser(description="Set up evaluator env")
    for flag in ("--challenge-dir", "--setup-dir"):
        cli.add_argument(flag, required=True)
    cli.add_argument("--mode", choices=["validation", "official"], required=True)
    cli.add_argument("--target", required=True)
    args = cli.parse_args()

    setup_root = Path(args.setup_dir)
    project_root = setup_root / ENV_PROJECT_DIR
    cache_root = setup_root / "uv-cache"

    project_root.mkdir(parents=True, exist_ok=True)
    (project_root / "pyproject.toml").write_text(PYPROJECT, encoding="utf-8")

    # Point every piece of uv state (cache, venv, interpreter installs) at
    # locations inside the setup directory.
    env = {
        **os.environ,
        "UV_CACHE_DIR": str(cache_root),
        "UV_LINK_MODE": "copy",
        "UV_PROJECT_ENVIRONMENT": str(project_root / ".venv"),
        "UV_PYTHON_INSTALL_DIR": str(setup_root / PYTHON_INSTALL_DIR),
    }

    install_cmd = ["uv", "python", "install", PYTHON_REQUEST]
    subprocess.run(install_cmd, check=True, env=env, timeout=180)

    interpreter = find_managed_python(env)
    sync_cmd = [
        "uv", "sync",
        "--project", str(project_root),
        "--python", str(interpreter),
        "--no-dev",
        "--no-install-project",
    ]
    subprocess.run(sync_cmd, check=True, env=env, timeout=1200)

    marker = {"mode": args.mode, "target": args.target}
    (project_root / "agentics-env.json").write_text(json.dumps(marker, indent=2), encoding="utf-8")

    # The cache is only needed while syncing; it is removed on the success path.
    shutil.rmtree(cache_root, ignore_errors=True)
    return 0

def find_managed_python(env: dict[str, str]) -> Path:
    """Ask uv for the managed interpreter matching ``PYTHON_REQUEST``.

    Args:
        env: Environment passed to the ``uv`` subprocess.

    Returns:
        The path printed by ``uv python find`` (whitespace-stripped).

    Raises:
        RuntimeError: If the printed path is not an existing file.
        subprocess.CalledProcessError: If ``uv`` exits non-zero.
        subprocess.TimeoutExpired: If ``uv`` takes longer than 60 seconds.
    """
    command = ["uv", "python", "find", PYTHON_REQUEST, "--managed-python", "--resolve-links"]
    completed = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    interpreter = Path(completed.stdout.strip())
    if interpreter.is_file():
        return interpreter
    raise RuntimeError(f"managed Python not found at {interpreter}")
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public Validation

Tiny deterministic validation config for `research/problems/vector_addition/2_28`.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"runner": "frontier_python_evaluate",
"submission_spec_path": "public/submission_spec.json",
"benchmark_override": "vector_sizes",
"sizes": [
1024
],
"num_samples": 1,
"gpu_warmups": 1,
"inner_warmups": 1
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"problem_name": "vector_addition",
"description": "Triton kernel optimization problem for high-performance vector addition",
"requirements": {
"cuda_backend": true,
"gpu_required": true,
"triton_version": ">=2.1.0",
"torch_version": ">=2.0.0"
},
"evaluation": {
"timeout_seconds": 300,
"memory_limit_mb": 8192,
"gpu_memory_limit_mb": 4096
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[project]
name = "vector-addition"
version = "0.1.0"
description = "Vector addition problem resources"
requires-python = ">=3.8"
dependencies = []
# Docker image already has torch, triton, numpy, tqdm

[build-system]
requires = ["setuptools>=45", "wheel"]
build-backend = "setuptools.build_meta"
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"problem_name": "vector_addition",
"description": "Triton kernel optimization problem for high-performance vector addition",
"requirements": {
"cuda_backend": true,
"gpu_required": true,
"triton_version": ">=2.1.0",
"torch_version": ">=2.0.0"
},
"evaluation": {
"timeout_seconds": 300,
"memory_limit_mb": 8192,
"gpu_memory_limit_mb": 4096
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"""
Vector Addition
===============

In this tutorial, you will write a simple vector addition using Triton.

In doing so, you will learn about:

* The basic programming model of Triton.

* The `triton.jit` decorator, which is used to define Triton kernels.

* The best practices for validating and benchmarking your custom ops against native reference implementations.

"""

# %%
# Compute Kernel
# --------------

import torch

import triton
import triton.language as tl

# Fail fast at import time when CUDA is unavailable: the statements further
# down this file create their tensors on DEVICE.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. This benchmark requires a CUDA-enabled GPU.")
# The single device used throughout this file; `add` asserts its tensors live on it.
DEVICE = torch.device("cuda:0")
# Make DEVICE the current CUDA device.
torch.cuda.set_device(DEVICE)


@triton.jit
def add_kernel(
    x_ptr,  # *Pointer* to the first input vector.
    y_ptr,  # *Pointer* to the second input vector.
    output_ptr,  # *Pointer* to the output vector.
    n_elements,  # Number of elements in the vectors.
    BLOCK_SIZE: tl.constexpr,  # Elements handled per program; `constexpr` so it can be used as a shape.
):
    # Each program instance handles one BLOCK_SIZE-wide slice of the vectors;
    # its index along launch-grid axis 0 selects the slice.
    program_index = tl.program_id(axis=0)
    # Element indices covered by this program, e.g. with BLOCK_SIZE=64 program 1
    # covers [64, 128). These are integer offsets, added to the base pointers below.
    element_idx = program_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # The last program may extend past the end of the data when n_elements is
    # not a multiple of BLOCK_SIZE; the mask disables those lanes.
    in_bounds = element_idx < n_elements
    lhs = tl.load(x_ptr + element_idx, mask=in_bounds)
    rhs = tl.load(y_ptr + element_idx, mask=in_bounds)
    # Write the element-wise sum back, again only for in-bounds lanes.
    tl.store(output_ptr + element_idx, lhs + rhs, mask=in_bounds)


# %%
# Let's also declare a helper function to (1) allocate the output tensor
# and (2) enqueue the above kernel with appropriate grid/block sizes:


def add(x: torch.Tensor, y: torch.Tensor):
    """Return the element-wise sum of ``x`` and ``y`` computed by ``add_kernel``.

    Args:
        x: First operand; must be on ``DEVICE``. The result takes its shape
            and dtype from ``x`` (via ``torch.empty_like``).
        y: Second operand; must be on ``DEVICE`` and hold the same number of
            elements as ``x``.

    Returns:
        A newly allocated tensor like ``x``. No synchronization is performed
        here, so the kernel may still be running when this function returns.

    Raises:
        ValueError: If ``x`` and ``y`` hold a different number of elements.
        AssertionError: If either tensor is not on ``DEVICE``.
    """
    # The kernel addresses x, y and the output with the same flat offsets and
    # masks only against the output length, so a shorter `y` would be read
    # out of bounds. Reject the mismatch before launching anything.
    if x.numel() != y.numel():
        raise ValueError(
            f"add() requires inputs with the same number of elements, "
            f"got {x.numel()} and {y.numel()}"
        )
    # We need to preallocate the output.
    output = torch.empty_like(x)
    assert x.device == DEVICE and y.device == DEVICE and output.device == DEVICE
    n_elements = output.numel()
    # The SPMD launch grid denotes the number of kernel instances that run in parallel.
    # It can be either Tuple[int] or Callable(metaparameters) -> Tuple[int].
    # Here it is a 1D grid whose size is the number of BLOCK_SIZE-wide blocks.
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
    # NOTE:
    #  - Each torch.Tensor argument is implicitly converted into a pointer to its first element.
    #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel.
    #  - Meta-parameters such as BLOCK_SIZE must be passed as keyword arguments.
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    # We return a handle to `output` but, since `torch.cuda.synchronize()` hasn't been
    # called, the kernel may still be running asynchronously at this point.
    return output


# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:

# Compare the Triton result against PyTorch's own addition on seeded random
# inputs and report the largest absolute deviation.
torch.manual_seed(0)
size = 98432
x = torch.rand(size, device=DEVICE)
y = torch.rand(size, device=DEVICE)
output_torch = torch.add(x, y)
output_triton = add(x, y)
print(output_torch)
print(output_triton)
print(f'The maximum difference between torch and triton is {(output_torch - output_triton).abs().max()}')

# %%
# Seems like we're good to go!

# %%
# Benchmark
# ---------
#
# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.
# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops
# for different problem sizes.


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=['size'],  # Argument names to use as an x-axis for the plot.
        x_vals=[2**i for i in range(12, 28, 1)],  # Different possible values for `x_name`.
        x_log=True,  # x axis is logarithmic.
        line_arg='provider',  # Argument name whose value corresponds to a different line in the plot.
        line_vals=['triton', 'torch'],  # Possible values for `line_arg`.
        line_names=['Triton', 'Torch'],  # Label name for the lines.
        styles=[('blue', '-'), ('green', '-')],  # Line styles.
        ylabel='GB/s',  # Label name for the y-axis.
        plot_name='vector-add-performance',  # Name for the plot. Used also as a file name for saving the plot.
        args={},  # Values for function arguments not in `x_names` and `y_name`.
    ))
def benchmark(size, provider):
    """Measure vector-add throughput for one (size, provider) pair.

    Args:
        size: Number of float32 elements in each input vector.
        provider: ``'torch'`` to time ``x + y`` or ``'triton'`` to time ``add(x, y)``.

    Returns:
        A ``(median, low, high)`` tuple of throughputs in GB/s, derived from
        the 0.5, 0.8 and 0.2 quantiles of the measured time respectively
        (a longer time means a lower throughput).

    Raises:
        ValueError: If ``provider`` is neither ``'torch'`` nor ``'triton'``.
    """
    # Pick the callable up front so an unknown provider fails with a clear
    # message instead of an unbound-variable error after the timing step.
    if provider == 'torch':
        run = lambda: x + y
    elif provider == 'triton':
        run = lambda: add(x, y)
    else:
        raise ValueError(f"unknown provider {provider!r}; expected 'torch' or 'triton'")

    x = torch.rand(size, device=DEVICE, dtype=torch.float32)
    y = torch.rand(size, device=DEVICE, dtype=torch.float32)
    quantiles = [0.5, 0.2, 0.8]
    ms, min_ms, max_ms = triton.testing.do_bench(run, quantiles=quantiles)

    def gbps(elapsed_ms):
        # Three vectors' worth of bytes move per call: x and y are read, the result is written.
        return 3 * x.numel() * x.element_size() * 1e-9 / (elapsed_ms * 1e-3)

    return gbps(ms), gbps(max_ms), gbps(min_ms)


# %%
# We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or
# `save_path='/path/to/results/'` to save them to disk along with raw CSV data:
benchmark.run(print_data=True, show_plots=False)
Loading
Loading