From f13b9be988bd479e07fa74f9791bc09f9f1cecde Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 8 Feb 2026 19:02:43 -0800
Subject: [PATCH 1/3] Add CUTLASS v4.3.5 C++ headers to Modal runner image

Install CUTLASS C++ headers to /opt/cutlass so users can
#include <cutlass/...> and #include <cute/...> in their
submissions. Also adds a test script to validate the setup
before deploying, and documents how to add new C++ deps.
---
 src/runners/modal_runner.py       |  27 +++-
 src/runners/test_cutlass_image.py | 202 ++++++++++++++++++++++++++++++
 2 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 src/runners/test_cutlass_image.py
diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py
index 695c96d3..310fdb6d 100644
--- a/src/runners/modal_runner.py
+++ b/src/runners/modal_runner.py
@@ -14,7 +14,24 @@
 operating_sys = "ubuntu24.04"
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
 
-# Move this to another file later:
+# === Image Definition ===
+#
+# Adding new C++ library dependencies:
+#   1. Add a .run_commands() step that installs headers to /opt/<library_name>
+#      Use `git clone --depth 1 --branch <tag>` for header-only libs to keep the image small.
+#   2. Add the include paths to CPLUS_INCLUDE_PATH in the .env() block at the bottom
+#      so that nvcc finds them automatically without -I flags.
+#   3. Test changes with a throwaway script (see test_cutlass_image.py in git history) first:
+#        cd src/runners && modal run <your_test_script>.py
+#
+# For users writing submissions with torch.utils.cpp_extension.load_inline:
+#   C++ headers installed on the image (like CUTLASS) require explicit include paths:
+#     load_inline(
+#         ...
+#         extra_include_paths=["/opt/cutlass/include", "/opt/cutlass/tools/util/include"],
+#     )
+#   For raw nvcc compilation, CPLUS_INCLUDE_PATH is set so includes work automatically.
+#
 cuda_image = (
     Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13")
     .run_commands("ln -sf $(which python) /usr/local/bin/python3")
@@ -52,6 +69,14 @@
         # "nvmath-python[cu13]~=0.4",
         # "numba-cuda[cu13]~=0.15",
     )
+    # CUTLASS C++ headers for #include <cutlass/...>
+    .run_commands(
+        "git clone --depth 1 --branch v4.3.5 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
+    )
+    .env({
+        "CUTLASS_PATH": "/opt/cutlass",
+        "CPLUS_INCLUDE_PATH": "/opt/cutlass/include:/opt/cutlass/tools/util/include",
+    })
 )
 
 cuda_image = cuda_image.add_local_python_source(
diff --git a/src/runners/test_cutlass_image.py b/src/runners/test_cutlass_image.py
new file mode 100644
index 00000000..0ae06497
--- /dev/null
+++ b/src/runners/test_cutlass_image.py
@@ -0,0 +1,202 @@
+"""Test script to verify CUTLASS C++ headers work on a Modal image.
+
+Usage:
+    cd src/runners
+    modal run test_cutlass_image.py
+
+This builds a test image with CUTLASS v4.3.5 headers and runs a simple
+compilation test on a GPU. Does NOT affect the production image.
+"""
+
+import modal
+
+app = modal.App("test-cutlass-image")
+
+cuda_version = "13.1.0"
+flavor = "devel"
+operating_sys = "ubuntu24.04"
+tag = f"{cuda_version}-{flavor}-{operating_sys}"
+
+test_image = (
+    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13")
+    .run_commands("ln -sf $(which python) /usr/local/bin/python3")
+    .apt_install("git", "gcc-13", "g++-13")
+    .uv_pip_install("ninja~=1.11")
+    .uv_pip_install(
+        "torch==2.9.1",
+        index_url="https://download.pytorch.org/whl/cu130",
+    )
+    # CUTLASS C++ headers
+    .run_commands(
+        "git clone --depth 1 --branch v4.3.5 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
+    )
+    .env({
+        "CUTLASS_PATH": "/opt/cutlass",
+        "CPLUS_INCLUDE_PATH": "/opt/cutlass/include:/opt/cutlass/tools/util/include",
+    })
+)
+
+
+CUTLASS_TEST_CU = r"""
+#include <iostream>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_types.h>
+#include <cute/tensor.hpp>
+
+int main() {
+    std::cout << "CUTLASS include works!" << std::endl;
+
+    // Test CuTe tensor layout (core CUTLASS 3.x/4.x API)
+    auto layout = cute::make_layout(cute::make_shape(4, 8));
+    std::cout << "CuTe layout size: " << cute::size(layout) << std::endl;
+
+    // Test CUTLASS numeric types
+    cutlass::half_t h = cutlass::half_t(3.14f);
+    std::cout << "half_t value: " << float(h) << std::endl;
+
+    std::cout << "All CUTLASS tests passed!" << std::endl;
+    return 0;
+}
+"""
+
+
+@app.function(gpu="T4", image=test_image, timeout=300)
+def test_cutlass():
+    import subprocess
+    import tempfile
+    import os
+
+    results = {}
+
+    # Test 1: Check that CUTLASS headers exist
+    cutlass_path = os.environ.get("CUTLASS_PATH", "")
+    header = os.path.join(cutlass_path, "include", "cutlass", "cutlass.h")
+    results["cutlass_path"] = cutlass_path
+    results["header_exists"] = os.path.exists(header)
+
+    # Test 2: Check CPLUS_INCLUDE_PATH is set
+    results["cplus_include_path"] = os.environ.get("CPLUS_INCLUDE_PATH", "NOT SET")
+
+    # Test 3: Compile and run a simple CUTLASS program
+    with tempfile.TemporaryDirectory() as tmpdir:
+        cu_file = os.path.join(tmpdir, "test_cutlass.cu")
+        binary = os.path.join(tmpdir, "test_cutlass")
+
+        with open(cu_file, "w") as f:
+            f.write(CUTLASS_TEST_CU)
+
+        compile_cmd = [
+            "nvcc",
+            cu_file,
+            "-o", binary,
+            "-I", f"{cutlass_path}/include",
+            "-I", f"{cutlass_path}/tools/util/include",
+            "-std=c++17",
+            "-arch=sm_75",
+        ]
+
+        compile_result = subprocess.run(
+            compile_cmd, capture_output=True, text=True
+        )
+        results["compile_returncode"] = compile_result.returncode
+        results["compile_stdout"] = compile_result.stdout
+        results["compile_stderr"] = compile_result.stderr
+
+        if compile_result.returncode == 0:
+            run_result = subprocess.run(
+                [binary], capture_output=True, text=True
+            )
+            results["run_returncode"] = run_result.returncode
+            results["run_stdout"] = run_result.stdout
+            results["run_stderr"] = run_result.stderr
+
+    # Test 4: Check that PyTorch CUDA extension loading works with CUTLASS
+    torch_cutlass_test = """
+import torch
+from torch.utils.cpp_extension import load_inline
+
+cuda_src = '''
+#include <cutlass/cutlass.h>
+#include <torch/extension.h>
+
+torch::Tensor check_cutlass(torch::Tensor x) {
+    // Just return input - proves cutlass headers are findable
+    return x;
+}
+'''
+
+cpp_src = "torch::Tensor check_cutlass(torch::Tensor x);"
+
+try:
+    mod = load_inline(
+        name="cutlass_check",
+        cpp_sources=[cpp_src],
+        cuda_sources=[cuda_src],
+        extra_include_paths=["/opt/cutlass/include", "/opt/cutlass/tools/util/include"],
+        functions=["check_cutlass"],
+        verbose=True,
+    )
+    t = torch.randn(4, device="cuda")
+    result = mod.check_cutlass(t)
+    print(f"SUCCESS: PyTorch inline CUDA extension with CUTLASS compiled and ran. Output shape: {result.shape}")
+except Exception as e:
+    print(f"FAILED: {e}")
+"""
+    torch_result = subprocess.run(
+        ["python", "-c", torch_cutlass_test],
+        capture_output=True, text=True
+    )
+    results["torch_extension_returncode"] = torch_result.returncode
+    results["torch_extension_stdout"] = torch_result.stdout
+    results["torch_extension_stderr"] = torch_result.stderr[-2000:] if len(torch_result.stderr) > 2000 else torch_result.stderr
+
+    return results
+
+
+@app.local_entrypoint()
+def main():
+    print("=" * 60)
+    print("Testing CUTLASS C++ headers on Modal (T4 GPU)")
+    print("=" * 60)
+
+    results = test_cutlass.remote()
+
+    print(f"\n--- Environment ---")
+    print(f"CUTLASS_PATH: {results['cutlass_path']}")
+    print(f"Header exists: {results['header_exists']}")
+    print(f"CPLUS_INCLUDE_PATH: {results['cplus_include_path']}")
+
+    print(f"\n--- nvcc Compilation Test ---")
+    print(f"Return code: {results['compile_returncode']}")
+    if results['compile_stdout']:
+        print(f"stdout: {results['compile_stdout']}")
+    if results['compile_stderr']:
+        print(f"stderr: {results['compile_stderr']}")
+
+    if results.get('run_returncode') is not None:
+        print(f"\n--- Run Test ---")
+        print(f"Return code: {results['run_returncode']}")
+        print(f"stdout: {results['run_stdout']}")
+        if results['run_stderr']:
+            print(f"stderr: {results['run_stderr']}")
+
+    print(f"\n--- PyTorch Inline Extension Test ---")
+    print(f"Return code: {results['torch_extension_returncode']}")
+    if results['torch_extension_stdout']:
+        print(f"stdout: {results['torch_extension_stdout']}")
+    if results['torch_extension_stderr']:
+        print(f"stderr (last 2000 chars): {results['torch_extension_stderr']}")
+
+    # Summary
+    print("\n" + "=" * 60)
+    all_pass = (
+        results['header_exists']
+        and results['compile_returncode'] == 0
+        and results.get('run_returncode') == 0
+        and results['torch_extension_returncode'] == 0
+    )
+    if all_pass:
+        print("ALL TESTS PASSED - safe to add CUTLASS to production image")
+    else:
+        print("SOME TESTS FAILED - check output above")
+    print("=" * 60)

From 052f6cd0eb04e516d78e90cf8e9be32da72a6ce4 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 8 Feb 2026 19:04:25 -0800
Subject: [PATCH 2/3] Remove test_cutlass_image.py after validating CUTLASS
 setup

---
 src/runners/test_cutlass_image.py | 202 ------------------------------
 1 file changed, 202 deletions(-)
 delete mode 100644 src/runners/test_cutlass_image.py

diff --git a/src/runners/test_cutlass_image.py b/src/runners/test_cutlass_image.py
deleted file mode 100644
index 0ae06497..00000000
--- a/src/runners/test_cutlass_image.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""Test script to verify CUTLASS C++ headers work on a Modal image.
-
-Usage:
-    cd src/runners
-    modal run test_cutlass_image.py
-
-This builds a test image with CUTLASS v4.3.5 headers and runs a simple
-compilation test on a GPU. Does NOT affect the production image.
-"""
-
-import modal
-
-app = modal.App("test-cutlass-image")
-
-cuda_version = "13.1.0"
-flavor = "devel"
-operating_sys = "ubuntu24.04"
-tag = f"{cuda_version}-{flavor}-{operating_sys}"
-
-test_image = (
-    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13")
-    .run_commands("ln -sf $(which python) /usr/local/bin/python3")
-    .apt_install("git", "gcc-13", "g++-13")
-    .uv_pip_install("ninja~=1.11")
-    .uv_pip_install(
-        "torch==2.9.1",
-        index_url="https://download.pytorch.org/whl/cu130",
-    )
-    # CUTLASS C++ headers
-    .run_commands(
-        "git clone --depth 1 --branch v4.3.5 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
-    )
-    .env({
-        "CUTLASS_PATH": "/opt/cutlass",
-        "CPLUS_INCLUDE_PATH": "/opt/cutlass/include:/opt/cutlass/tools/util/include",
-    })
-)
-
-
-CUTLASS_TEST_CU = r"""
-#include <iostream>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_types.h>
-#include <cute/tensor.hpp>
-
-int main() {
-    std::cout << "CUTLASS include works!" << std::endl;
-
-    // Test CuTe tensor layout (core CUTLASS 3.x/4.x API)
-    auto layout = cute::make_layout(cute::make_shape(4, 8));
-    std::cout << "CuTe layout size: " << cute::size(layout) << std::endl;
-
-    // Test CUTLASS numeric types
-    cutlass::half_t h = cutlass::half_t(3.14f);
-    std::cout << "half_t value: " << float(h) << std::endl;
-
-    std::cout << "All CUTLASS tests passed!" << std::endl;
-    return 0;
-}
-"""
-
-
-@app.function(gpu="T4", image=test_image, timeout=300)
-def test_cutlass():
-    import subprocess
-    import tempfile
-    import os
-
-    results = {}
-
-    # Test 1: Check that CUTLASS headers exist
-    cutlass_path = os.environ.get("CUTLASS_PATH", "")
-    header = os.path.join(cutlass_path, "include", "cutlass", "cutlass.h")
-    results["cutlass_path"] = cutlass_path
-    results["header_exists"] = os.path.exists(header)
-
-    # Test 2: Check CPLUS_INCLUDE_PATH is set
-    results["cplus_include_path"] = os.environ.get("CPLUS_INCLUDE_PATH", "NOT SET")
-
-    # Test 3: Compile and run a simple CUTLASS program
-    with tempfile.TemporaryDirectory() as tmpdir:
-        cu_file = os.path.join(tmpdir, "test_cutlass.cu")
-        binary = os.path.join(tmpdir, "test_cutlass")
-
-        with open(cu_file, "w") as f:
-            f.write(CUTLASS_TEST_CU)
-
-        compile_cmd = [
-            "nvcc",
-            cu_file,
-            "-o", binary,
-            "-I", f"{cutlass_path}/include",
-            "-I", f"{cutlass_path}/tools/util/include",
-            "-std=c++17",
-            "-arch=sm_75",
-        ]
-
-        compile_result = subprocess.run(
-            compile_cmd, capture_output=True, text=True
-        )
-        results["compile_returncode"] = compile_result.returncode
-        results["compile_stdout"] = compile_result.stdout
-        results["compile_stderr"] = compile_result.stderr
-
-        if compile_result.returncode == 0:
-            run_result = subprocess.run(
-                [binary], capture_output=True, text=True
-            )
-            results["run_returncode"] = run_result.returncode
-            results["run_stdout"] = run_result.stdout
-            results["run_stderr"] = run_result.stderr
-
-    # Test 4: Check that PyTorch CUDA extension loading works with CUTLASS
-    torch_cutlass_test = """
-import torch
-from torch.utils.cpp_extension import load_inline
-
-cuda_src = '''
-#include <cutlass/cutlass.h>
-#include <torch/extension.h>
-
-torch::Tensor check_cutlass(torch::Tensor x) {
-    // Just return input - proves cutlass headers are findable
-    return x;
-}
-'''
-
-cpp_src = "torch::Tensor check_cutlass(torch::Tensor x);"
-
-try:
-    mod = load_inline(
-        name="cutlass_check",
-        cpp_sources=[cpp_src],
-        cuda_sources=[cuda_src],
-        extra_include_paths=["/opt/cutlass/include", "/opt/cutlass/tools/util/include"],
-        functions=["check_cutlass"],
-        verbose=True,
-    )
-    t = torch.randn(4, device="cuda")
-    result = mod.check_cutlass(t)
-    print(f"SUCCESS: PyTorch inline CUDA extension with CUTLASS compiled and ran. Output shape: {result.shape}")
-except Exception as e:
-    print(f"FAILED: {e}")
-"""
-    torch_result = subprocess.run(
-        ["python", "-c", torch_cutlass_test],
-        capture_output=True, text=True
-    )
-    results["torch_extension_returncode"] = torch_result.returncode
-    results["torch_extension_stdout"] = torch_result.stdout
-    results["torch_extension_stderr"] = torch_result.stderr[-2000:] if len(torch_result.stderr) > 2000 else torch_result.stderr
-
-    return results
-
-
-@app.local_entrypoint()
-def main():
-    print("=" * 60)
-    print("Testing CUTLASS C++ headers on Modal (T4 GPU)")
-    print("=" * 60)
-
-    results = test_cutlass.remote()
-
-    print(f"\n--- Environment ---")
-    print(f"CUTLASS_PATH: {results['cutlass_path']}")
-    print(f"Header exists: {results['header_exists']}")
-    print(f"CPLUS_INCLUDE_PATH: {results['cplus_include_path']}")
-
-    print(f"\n--- nvcc Compilation Test ---")
-    print(f"Return code: {results['compile_returncode']}")
-    if results['compile_stdout']:
-        print(f"stdout: {results['compile_stdout']}")
-    if results['compile_stderr']:
-        print(f"stderr: {results['compile_stderr']}")
-
-    if results.get('run_returncode') is not None:
-        print(f"\n--- Run Test ---")
-        print(f"Return code: {results['run_returncode']}")
-        print(f"stdout: {results['run_stdout']}")
-        if results['run_stderr']:
-            print(f"stderr: {results['run_stderr']}")
-
-    print(f"\n--- PyTorch Inline Extension Test ---")
-    print(f"Return code: {results['torch_extension_returncode']}")
-    if results['torch_extension_stdout']:
-        print(f"stdout: {results['torch_extension_stdout']}")
-    if results['torch_extension_stderr']:
-        print(f"stderr (last 2000 chars): {results['torch_extension_stderr']}")
-
-    # Summary
-    print("\n" + "=" * 60)
-    all_pass = (
-        results['header_exists']
-        and results['compile_returncode'] == 0
-        and results.get('run_returncode') == 0
-        and results['torch_extension_returncode'] == 0
-    )
-    if all_pass:
-        print("ALL TESTS PASSED - safe to add CUTLASS to production image")
-    else:
-        print("SOME TESTS FAILED - check output above")
-    print("=" * 60)

From aaa8459fbc29130bb0770bd64ed980d6d7b91908 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 8 Feb 2026 19:04:59 -0800
Subject: [PATCH 3/3] Remove Claude GitHub Actions workflows

---
 .github/workflows/claude-code-review.yml | 44 --------------------
 .github/workflows/claude.yml             | 51 ------------------------
 2 files changed, 95 deletions(-)
 delete mode 100644 .github/workflows/claude-code-review.yml
 delete mode 100644 .github/workflows/claude.yml

diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml
deleted file mode 100644
index c2d320c5..00000000
--- a/.github/workflows/claude-code-review.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Claude Code Review
-
-on:
-  pull_request:
-    types: [opened, synchronize, ready_for_review, reopened]
-    # Optional: Only run on specific file changes
-    # paths:
-    #   - "src/**/*.ts"
-    #   - "src/**/*.tsx"
-    #   - "src/**/*.js"
-    #   - "src/**/*.jsx"
-
-jobs:
-  claude-review:
-    # Optional: Filter by PR author
-    # if: |
-    #   github.event.pull_request.user.login == 'external-contributor' ||
-    #   github.event.pull_request.user.login == 'new-developer' ||
-    #   github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR'
-
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write
-      issues: read
-      id-token: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: Run Claude Code Review
-        id: claude-review
-        uses: anthropics/claude-code-action@v1
-        with:
-          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
-          plugin_marketplaces: 'https://github.com/anthropics/claude-code.git'
-          plugins: 'code-review@claude-code-plugins'
-          prompt: '/code-review:code-review ${{ github.repository }}/pull/${{ github.event.pull_request.number }}'
-          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
-          # or https://code.claude.com/docs/en/cli-reference for available options
-
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
deleted file mode 100644
index fef98739..00000000
--- a/.github/workflows/claude.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-name: Claude Code
-
-on:
-  issue_comment:
-    types: [created]
-  pull_request_review_comment:
-    types: [created]
-  issues:
-    types: [opened, assigned]
-  pull_request_review:
-    types: [submitted]
-
-jobs:
-  claude:
-    if: |
-      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
-      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
-      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
-      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write
-      issues: write
-      id-token: write
-      actions: read # Required for Claude to read CI results on PRs
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: Run Claude Code
-        id: claude
-        uses: anthropics/claude-code-action@v1
-        with:
-          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
-          show_full_output: true
-
-          # This is an optional setting that allows Claude to read CI results on PRs
-          additional_permissions: |
-            actions: read
-
-          # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it.
-          # prompt: 'Update the pull request description to include a summary of changes.'
-
-          # Optional: Add claude_args to customize behavior and configuration
-          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
-          # or https://code.claude.com/docs/en/cli-reference for available options
-          # claude_args: '--allowed-tools Bash(gh pr:*)'
-