From f13b9be988bd479e07fa74f9791bc09f9f1cecde Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 8 Feb 2026 19:02:43 -0800 Subject: [PATCH 1/3] Add CUTLASS v4.3.5 C++ headers to Modal runner image Install CUTLASS C++ headers to /opt/cutlass so users can #include <cutlass/...> and #include <cute/...> in their submissions. Also adds a test script to validate the setup before deploying, and documents how to add new C++ deps. --- src/runners/modal_runner.py | 27 +++- src/runners/test_cutlass_image.py | 202 ++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 src/runners/test_cutlass_image.py diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py index 695c96d3..310fdb6d 100644 --- a/src/runners/modal_runner.py +++ b/src/runners/modal_runner.py @@ -14,7 +14,24 @@ operating_sys = "ubuntu24.04" tag = f"{cuda_version}-{flavor}-{operating_sys}" -# Move this to another file later: +# === Image Definition === +# +# Adding new C++ library dependencies: +# 1. Add a .run_commands() step that installs headers to /opt/<name> +# Use `git clone --depth 1 --branch <tag>` for header-only libs to keep the image small. +# 2. Add the include paths to CPLUS_INCLUDE_PATH in the .env() block at the bottom +# so that nvcc finds them automatically without -I flags. +# 3. Test changes with test_cutlass_image.py (or a similar script) before deploying: +# cd src/runners && modal run test_cutlass_image.py +# +# For users writing submissions with torch.utils.cpp_extension.load_inline: +# C++ headers installed on the image (like CUTLASS) require explicit include paths: +# load_inline( +# ... +# extra_include_paths=["/opt/cutlass/include", "/opt/cutlass/tools/util/include"], +# ) +# For raw nvcc compilation, CPLUS_INCLUDE_PATH is set so includes work automatically.
+# cuda_image = ( Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13") .run_commands("ln -sf $(which python) /usr/local/bin/python3") @@ -52,6 +69,14 @@ # "nvmath-python[cu13]~=0.4", # "numba-cuda[cu13]~=0.15", ) + # CUTLASS C++ headers for #include + .run_commands( + "git clone --depth 1 --branch v4.3.5 https://github.com/NVIDIA/cutlass.git /opt/cutlass", + ) + .env({ + "CUTLASS_PATH": "/opt/cutlass", + "CPLUS_INCLUDE_PATH": "/opt/cutlass/include:/opt/cutlass/tools/util/include", + }) ) cuda_image = cuda_image.add_local_python_source( diff --git a/src/runners/test_cutlass_image.py b/src/runners/test_cutlass_image.py new file mode 100644 index 00000000..0ae06497 --- /dev/null +++ b/src/runners/test_cutlass_image.py @@ -0,0 +1,202 @@ +"""Test script to verify CUTLASS C++ headers work on a Modal image. + +Usage: + cd src/runners + modal run test_cutlass_image.py + +This builds a test image with CUTLASS v4.3.5 headers and runs a simple +compilation test on a GPU. Does NOT affect the production image. +""" + +import modal + +app = modal.App("test-cutlass-image") + +cuda_version = "13.1.0" +flavor = "devel" +operating_sys = "ubuntu24.04" +tag = f"{cuda_version}-{flavor}-{operating_sys}" + +test_image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13") + .run_commands("ln -sf $(which python) /usr/local/bin/python3") + .apt_install("git", "gcc-13", "g++-13") + .uv_pip_install("ninja~=1.11") + .uv_pip_install( + "torch==2.9.1", + index_url="https://download.pytorch.org/whl/cu130", + ) + # CUTLASS C++ headers + .run_commands( + "git clone --depth 1 --branch v4.3.5 https://github.com/NVIDIA/cutlass.git /opt/cutlass", + ) + .env({ + "CUTLASS_PATH": "/opt/cutlass", + "CPLUS_INCLUDE_PATH": "/opt/cutlass/include:/opt/cutlass/tools/util/include", + }) +) + + +CUTLASS_TEST_CU = r""" +#include +#include +#include +#include + +int main() { + std::cout << "CUTLASS include works!" 
<< std::endl; + + // Test CuTe tensor layout (core CUTLASS 3.x/4.x API) + auto layout = cute::make_layout(cute::make_shape(4, 8)); + std::cout << "CuTe layout size: " << cute::size(layout) << std::endl; + + // Test CUTLASS numeric types + cutlass::half_t h = cutlass::half_t(3.14f); + std::cout << "half_t value: " << float(h) << std::endl; + + std::cout << "All CUTLASS tests passed!" << std::endl; + return 0; +} +""" + + +@app.function(gpu="T4", image=test_image, timeout=300) +def test_cutlass(): + import subprocess + import tempfile + import os + + results = {} + + # Test 1: Check that CUTLASS headers exist + cutlass_path = os.environ.get("CUTLASS_PATH", "") + header = os.path.join(cutlass_path, "include", "cutlass", "cutlass.h") + results["cutlass_path"] = cutlass_path + results["header_exists"] = os.path.exists(header) + + # Test 2: Check CPLUS_INCLUDE_PATH is set + results["cplus_include_path"] = os.environ.get("CPLUS_INCLUDE_PATH", "NOT SET") + + # Test 3: Compile and run a simple CUTLASS program + with tempfile.TemporaryDirectory() as tmpdir: + cu_file = os.path.join(tmpdir, "test_cutlass.cu") + binary = os.path.join(tmpdir, "test_cutlass") + + with open(cu_file, "w") as f: + f.write(CUTLASS_TEST_CU) + + compile_cmd = [ + "nvcc", + cu_file, + "-o", binary, + "-I", f"{cutlass_path}/include", + "-I", f"{cutlass_path}/tools/util/include", + "-std=c++17", + "-arch=sm_75", + ] + + compile_result = subprocess.run( + compile_cmd, capture_output=True, text=True + ) + results["compile_returncode"] = compile_result.returncode + results["compile_stdout"] = compile_result.stdout + results["compile_stderr"] = compile_result.stderr + + if compile_result.returncode == 0: + run_result = subprocess.run( + [binary], capture_output=True, text=True + ) + results["run_returncode"] = run_result.returncode + results["run_stdout"] = run_result.stdout + results["run_stderr"] = run_result.stderr + + # Test 4: Check that PyTorch CUDA extension loading works with CUTLASS + 
torch_cutlass_test = """ +import torch +from torch.utils.cpp_extension import load_inline + +cuda_src = ''' +#include +#include + +torch::Tensor check_cutlass(torch::Tensor x) { + // Just return input - proves cutlass headers are findable + return x; +} +''' + +cpp_src = "torch::Tensor check_cutlass(torch::Tensor x);" + +try: + mod = load_inline( + name="cutlass_check", + cpp_sources=[cpp_src], + cuda_sources=[cuda_src], + extra_include_paths=["/opt/cutlass/include", "/opt/cutlass/tools/util/include"], + functions=["check_cutlass"], + verbose=True, + ) + t = torch.randn(4, device="cuda") + result = mod.check_cutlass(t) + print(f"SUCCESS: PyTorch inline CUDA extension with CUTLASS compiled and ran. Output shape: {result.shape}") +except Exception as e: + print(f"FAILED: {e}") +""" + torch_result = subprocess.run( + ["python", "-c", torch_cutlass_test], + capture_output=True, text=True + ) + results["torch_extension_returncode"] = torch_result.returncode + results["torch_extension_stdout"] = torch_result.stdout + results["torch_extension_stderr"] = torch_result.stderr[-2000:] if len(torch_result.stderr) > 2000 else torch_result.stderr + + return results + + +@app.local_entrypoint() +def main(): + print("=" * 60) + print("Testing CUTLASS C++ headers on Modal (T4 GPU)") + print("=" * 60) + + results = test_cutlass.remote() + + print(f"\n--- Environment ---") + print(f"CUTLASS_PATH: {results['cutlass_path']}") + print(f"Header exists: {results['header_exists']}") + print(f"CPLUS_INCLUDE_PATH: {results['cplus_include_path']}") + + print(f"\n--- nvcc Compilation Test ---") + print(f"Return code: {results['compile_returncode']}") + if results['compile_stdout']: + print(f"stdout: {results['compile_stdout']}") + if results['compile_stderr']: + print(f"stderr: {results['compile_stderr']}") + + if results.get('run_returncode') is not None: + print(f"\n--- Run Test ---") + print(f"Return code: {results['run_returncode']}") + print(f"stdout: {results['run_stdout']}") + if 
results['run_stderr']: + print(f"stderr: {results['run_stderr']}") + + print(f"\n--- PyTorch Inline Extension Test ---") + print(f"Return code: {results['torch_extension_returncode']}") + if results['torch_extension_stdout']: + print(f"stdout: {results['torch_extension_stdout']}") + if results['torch_extension_stderr']: + print(f"stderr (last 2000 chars): {results['torch_extension_stderr']}") + + # Summary + print("\n" + "=" * 60) + all_pass = ( + results['header_exists'] + and results['compile_returncode'] == 0 + and results.get('run_returncode') == 0 + and results['torch_extension_returncode'] == 0 + ) + if all_pass: + print("ALL TESTS PASSED - safe to add CUTLASS to production image") + else: + print("SOME TESTS FAILED - check output above") + print("=" * 60) From 052f6cd0eb04e516d78e90cf8e9be32da72a6ce4 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 8 Feb 2026 19:04:25 -0800 Subject: [PATCH 2/3] Remove test_cutlass_image.py after validating CUTLASS setup --- src/runners/test_cutlass_image.py | 202 ------------------------------ 1 file changed, 202 deletions(-) delete mode 100644 src/runners/test_cutlass_image.py diff --git a/src/runners/test_cutlass_image.py b/src/runners/test_cutlass_image.py deleted file mode 100644 index 0ae06497..00000000 --- a/src/runners/test_cutlass_image.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Test script to verify CUTLASS C++ headers work on a Modal image. - -Usage: - cd src/runners - modal run test_cutlass_image.py - -This builds a test image with CUTLASS v4.3.5 headers and runs a simple -compilation test on a GPU. Does NOT affect the production image. 
-""" - -import modal - -app = modal.App("test-cutlass-image") - -cuda_version = "13.1.0" -flavor = "devel" -operating_sys = "ubuntu24.04" -tag = f"{cuda_version}-{flavor}-{operating_sys}" - -test_image = ( - modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13") - .run_commands("ln -sf $(which python) /usr/local/bin/python3") - .apt_install("git", "gcc-13", "g++-13") - .uv_pip_install("ninja~=1.11") - .uv_pip_install( - "torch==2.9.1", - index_url="https://download.pytorch.org/whl/cu130", - ) - # CUTLASS C++ headers - .run_commands( - "git clone --depth 1 --branch v4.3.5 https://github.com/NVIDIA/cutlass.git /opt/cutlass", - ) - .env({ - "CUTLASS_PATH": "/opt/cutlass", - "CPLUS_INCLUDE_PATH": "/opt/cutlass/include:/opt/cutlass/tools/util/include", - }) -) - - -CUTLASS_TEST_CU = r""" -#include -#include -#include -#include - -int main() { - std::cout << "CUTLASS include works!" << std::endl; - - // Test CuTe tensor layout (core CUTLASS 3.x/4.x API) - auto layout = cute::make_layout(cute::make_shape(4, 8)); - std::cout << "CuTe layout size: " << cute::size(layout) << std::endl; - - // Test CUTLASS numeric types - cutlass::half_t h = cutlass::half_t(3.14f); - std::cout << "half_t value: " << float(h) << std::endl; - - std::cout << "All CUTLASS tests passed!" 
<< std::endl; - return 0; -} -""" - - -@app.function(gpu="T4", image=test_image, timeout=300) -def test_cutlass(): - import subprocess - import tempfile - import os - - results = {} - - # Test 1: Check that CUTLASS headers exist - cutlass_path = os.environ.get("CUTLASS_PATH", "") - header = os.path.join(cutlass_path, "include", "cutlass", "cutlass.h") - results["cutlass_path"] = cutlass_path - results["header_exists"] = os.path.exists(header) - - # Test 2: Check CPLUS_INCLUDE_PATH is set - results["cplus_include_path"] = os.environ.get("CPLUS_INCLUDE_PATH", "NOT SET") - - # Test 3: Compile and run a simple CUTLASS program - with tempfile.TemporaryDirectory() as tmpdir: - cu_file = os.path.join(tmpdir, "test_cutlass.cu") - binary = os.path.join(tmpdir, "test_cutlass") - - with open(cu_file, "w") as f: - f.write(CUTLASS_TEST_CU) - - compile_cmd = [ - "nvcc", - cu_file, - "-o", binary, - "-I", f"{cutlass_path}/include", - "-I", f"{cutlass_path}/tools/util/include", - "-std=c++17", - "-arch=sm_75", - ] - - compile_result = subprocess.run( - compile_cmd, capture_output=True, text=True - ) - results["compile_returncode"] = compile_result.returncode - results["compile_stdout"] = compile_result.stdout - results["compile_stderr"] = compile_result.stderr - - if compile_result.returncode == 0: - run_result = subprocess.run( - [binary], capture_output=True, text=True - ) - results["run_returncode"] = run_result.returncode - results["run_stdout"] = run_result.stdout - results["run_stderr"] = run_result.stderr - - # Test 4: Check that PyTorch CUDA extension loading works with CUTLASS - torch_cutlass_test = """ -import torch -from torch.utils.cpp_extension import load_inline - -cuda_src = ''' -#include -#include - -torch::Tensor check_cutlass(torch::Tensor x) { - // Just return input - proves cutlass headers are findable - return x; -} -''' - -cpp_src = "torch::Tensor check_cutlass(torch::Tensor x);" - -try: - mod = load_inline( - name="cutlass_check", - cpp_sources=[cpp_src], - 
cuda_sources=[cuda_src], - extra_include_paths=["/opt/cutlass/include", "/opt/cutlass/tools/util/include"], - functions=["check_cutlass"], - verbose=True, - ) - t = torch.randn(4, device="cuda") - result = mod.check_cutlass(t) - print(f"SUCCESS: PyTorch inline CUDA extension with CUTLASS compiled and ran. Output shape: {result.shape}") -except Exception as e: - print(f"FAILED: {e}") -""" - torch_result = subprocess.run( - ["python", "-c", torch_cutlass_test], - capture_output=True, text=True - ) - results["torch_extension_returncode"] = torch_result.returncode - results["torch_extension_stdout"] = torch_result.stdout - results["torch_extension_stderr"] = torch_result.stderr[-2000:] if len(torch_result.stderr) > 2000 else torch_result.stderr - - return results - - -@app.local_entrypoint() -def main(): - print("=" * 60) - print("Testing CUTLASS C++ headers on Modal (T4 GPU)") - print("=" * 60) - - results = test_cutlass.remote() - - print(f"\n--- Environment ---") - print(f"CUTLASS_PATH: {results['cutlass_path']}") - print(f"Header exists: {results['header_exists']}") - print(f"CPLUS_INCLUDE_PATH: {results['cplus_include_path']}") - - print(f"\n--- nvcc Compilation Test ---") - print(f"Return code: {results['compile_returncode']}") - if results['compile_stdout']: - print(f"stdout: {results['compile_stdout']}") - if results['compile_stderr']: - print(f"stderr: {results['compile_stderr']}") - - if results.get('run_returncode') is not None: - print(f"\n--- Run Test ---") - print(f"Return code: {results['run_returncode']}") - print(f"stdout: {results['run_stdout']}") - if results['run_stderr']: - print(f"stderr: {results['run_stderr']}") - - print(f"\n--- PyTorch Inline Extension Test ---") - print(f"Return code: {results['torch_extension_returncode']}") - if results['torch_extension_stdout']: - print(f"stdout: {results['torch_extension_stdout']}") - if results['torch_extension_stderr']: - print(f"stderr (last 2000 chars): {results['torch_extension_stderr']}") - - # 
Summary - print("\n" + "=" * 60) - all_pass = ( - results['header_exists'] - and results['compile_returncode'] == 0 - and results.get('run_returncode') == 0 - and results['torch_extension_returncode'] == 0 - ) - if all_pass: - print("ALL TESTS PASSED - safe to add CUTLASS to production image") - else: - print("SOME TESTS FAILED - check output above") - print("=" * 60) From aaa8459fbc29130bb0770bd64ed980d6d7b91908 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 8 Feb 2026 19:04:59 -0800 Subject: [PATCH 3/3] Remove Claude GitHub Actions workflows --- .github/workflows/claude-code-review.yml | 44 -------------------- .github/workflows/claude.yml | 51 ------------------------ 2 files changed, 95 deletions(-) delete mode 100644 .github/workflows/claude-code-review.yml delete mode 100644 .github/workflows/claude.yml diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml deleted file mode 100644 index c2d320c5..00000000 --- a/.github/workflows/claude-code-review.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Claude Code Review - -on: - pull_request: - types: [opened, synchronize, ready_for_review, reopened] - # Optional: Only run on specific file changes - # paths: - # - "src/**/*.ts" - # - "src/**/*.tsx" - # - "src/**/*.js" - # - "src/**/*.jsx" - -jobs: - claude-review: - # Optional: Filter by PR author - # if: | - # github.event.pull_request.user.login == 'external-contributor' || - # github.event.pull_request.user.login == 'new-developer' || - # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' - - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - issues: read - id-token: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Run Claude Code Review - id: claude-review - uses: anthropics/claude-code-action@v1 - with: - anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} - plugin_marketplaces: 
'https://github.com/anthropics/claude-code.git' - plugins: 'code-review@claude-code-plugins' - prompt: '/code-review:code-review ${{ github.repository }}/pull/${{ github.event.pull_request.number }}' - # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md - # or https://code.claude.com/docs/en/cli-reference for available options - diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml deleted file mode 100644 index fef98739..00000000 --- a/.github/workflows/claude.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Claude Code - -on: - issue_comment: - types: [created] - pull_request_review_comment: - types: [created] - issues: - types: [opened, assigned] - pull_request_review: - types: [submitted] - -jobs: - claude: - if: | - (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || - (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || - (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || - (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - issues: write - id-token: write - actions: read # Required for Claude to read CI results on PRs - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Run Claude Code - id: claude - uses: anthropics/claude-code-action@v1 - with: - anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} - show_full_output: true - - # This is an optional setting that allows Claude to read CI results on PRs - additional_permissions: | - actions: read - - # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. - # prompt: 'Update the pull request description to include a summary of changes.' 
- - # Optional: Add claude_args to customize behavior and configuration - # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md - # or https://code.claude.com/docs/en/cli-reference for available options - # claude_args: '--allowed-tools Bash(gh pr:*)' -