Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added events/sca_2026/MathDx_Ozaki-I_Tutorial.pdf
Binary file not shown.
67 changes: 67 additions & 0 deletions tutorials/floating-point-emulation/brev/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Docker Compose stack for the floating-point-emulation tutorial.
# The project name doubles as the value exported to containers as ACH_TUTORIAL.
name: &tutorial-name floating-point-emulation

# Extension section: shared scalars and service fragments. Nothing here is a
# service on its own; everything is pulled in below through YAML anchors.
x-config:
  dockerfile: &dockerfile tutorials/floating-point-emulation/brev/dockerfile
  image: &image ghcr.io/nvidia/floating-point-emulation-tutorial:latest
  working-dir: &working-dir /accelerated-computing-hub/tutorials/floating-point-emulation/notebooks
  large: &large true
  # Intentionally left without a value (null); used as the jupyter `command`.
  default-jupyter-url: &default-jupyter-url

  # GPU access and resource limits shared by every service.
  gpu-config: &gpu-config
    privileged: true
    ulimits:
      memlock: -1
      stack: 67108864
    shm_size: 1g
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # Volumes, environment and identity settings shared by every service.
  common-service: &common-service
    pull_policy: missing
    volumes:
      - accelerated-computing-hub:/accelerated-computing-hub
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      BREV_ENV_ID: ${BREV_ENV_ID:-}
      ACH_TUTORIAL: *tutorial-name
      ACH_RUN_TESTS: ${ACH_RUN_TESTS:-}
      ACH_USER: ${ACH_USER:-ach}
      ACH_UID: ${ACH_UID:-1000}
      ACH_GID: ${ACH_GID:-1000}
    user: root
    working_dir: *working-dir

  # Long-running services start only after `base` has exited successfully.
  persistent-service: &persistent-service
    depends_on:
      base:
        condition: service_completed_successfully
    restart: unless-stopped

services:
  # One-shot service: builds the tutorial image and runs the "base" entrypoint mode.
  base:
    <<: [*gpu-config, *common-service]
    image: *image
    entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "base"]
    build:
      context: ../../..
      dockerfile: *dockerfile
    restart: "no"

  # JupyterLab, published on the loopback interface only.
  jupyter:
    <<: [*gpu-config, *common-service, *persistent-service]
    image: *image
    entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "jupyter"]
    command: *default-jupyter-url
    ports:
      - "127.0.0.1:8888:8888" # JupyterLab

  # Nsight Systems streamer, published on the loopback interface only.
  nsight:
    <<: [*gpu-config, *common-service, *persistent-service]
    image: nvcr.io/nvidia/devtools/nsight-streamer-nsys:2025.3.1
    entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "nsight"]
    ports:
      - "127.0.0.1:8080:8080" # HTTP
      - "127.0.0.1:3478:3478" # TURN

# Named volume mounted at /accelerated-computing-hub in every service.
volumes:
  accelerated-computing-hub:
84 changes: 84 additions & 0 deletions tutorials/floating-point-emulation/brev/dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Image for the floating-point-emulation tutorial: CUDA 13.1 base image plus
# selected CUDA toolkit packages, build tools, the Docker CLI/engine packages,
# MathDx, libmathdx, and a Python virtual environment with JupyterLab.
FROM nvidia/cuda:13.1.0-base-ubuntu24.04

# PIP_ROOT_USER_ACTION=ignore silences pip's "running as root" warning.
# ACH_TUTORIAL is reused below in COPY and WORKDIR paths.
# BASH_ENV makes non-interactive bash shells source user-setup.bash.
ENV PIP_ROOT_USER_ACTION=ignore \
ACH_TUTORIAL=floating-point-emulation \
BASH_ENV=/accelerated-computing-hub/brev/user-setup.bash

# Install CUDA Toolkit + build tools.
# Steps, in order: bootstrap tools -> register the Kitware and Docker apt
# repositories (with their signing keys) -> install CUDA 13.1 components,
# Python, compilers/CMake, git, and the Docker packages.
RUN apt update -y \
&& apt install -y --no-install-recommends wget curl gnupg gosu lsb-release sudo \
&& wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
&& echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
&& apt update -y \
&& apt install -y cuda-nvrtc-13-1 cuda-cccl-13-1 libcublas-dev-13-1 \
libnvjitlink-13-1 cuda-cudart-13-1 cuda-nvcc-13-1 libnvvm-13-1 \
python-is-python3 python3-venv \
build-essential cmake \
git git-lfs \
docker-ce \
docker-ce-cli \
containerd.io \
docker-buildx-plugin \
docker-compose-plugin \
&& apt-get clean -y

# Install MathDx 25.12.1 (CUDA 13 build): download, unpack, and move the
# nvidia/mathdx subtree of the archive to /opt/nvidia/mathdx.
RUN wget https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/cuda13/nvidia-mathdx-25.12.1-cuda13.tar.gz \
&& tar -xvf nvidia-mathdx-25.12.1-cuda13.tar.gz \
&& rm nvidia-mathdx-25.12.1-cuda13.tar.gz \
&& mkdir -p /opt/nvidia \
&& mv nvidia-mathdx-25.12.1-cuda13/nvidia/mathdx /opt/nvidia/mathdx \
&& rm -rf nvidia-mathdx-25.12.1-cuda13

# Install libmathdx 0.3.1 (x86_64 build): unpack the archive into /opt/nvidia/libmathdx.
# NOTE(review): LD_LIBRARY_PATH below assumes the archive places lib/ directly
# under /opt/nvidia/libmathdx (no top-level directory inside the tarball) — confirm.
RUN wget https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/cuda13/libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz \
&& mkdir -p /opt/nvidia/libmathdx \
&& tar -xvf libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz -C /opt/nvidia/libmathdx \
&& rm libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz

# Create the Python virtual environment and put it first on PATH so that
# `python` and `pip` in later steps resolve to /opt/venv.
RUN python -m venv /opt/venv
ENV CUDA_PATH=/usr/local/cuda-13.1 \
PATH="/opt/venv/bin:$PATH" \
LD_LIBRARY_PATH="/opt/nvidia/libmathdx/lib:$LD_LIBRARY_PATH"

# Copy only the requirements file first, ahead of the full repository copy below.
COPY tutorials/${ACH_TUTORIAL}/brev/requirements.txt /opt/requirements.txt

# The backquoted `# ...` is an empty command substitution used as an inline label.
RUN set -ex \
&& `# Install Python packages` \
&& pip install --no-cache-dir -r /opt/requirements.txt \
&& rm /opt/requirements.txt

# Link the Jupyter server config and the IPython startup script from the
# repository into the build user's home, and disable the JupyterLab
# announcements extension.
RUN set -ex \
&& `# Setup JupyterLab` \
&& mkdir -p ~/.jupyter \
&& ln -fs /accelerated-computing-hub/brev/jupyter-server-config.py ~/.jupyter/jupyter_server_config.py \
&& mkdir -p ~/.ipython/profile_default/startup \
&& ln -fs /accelerated-computing-hub/brev/ipython-startup-add-cwd-to-path.py ~/.ipython/profile_default/startup/00-add-cwd-to-path.py \
&& python -m jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

# Enable passwordless sudo for all users and pass through environment and path
# (the sed comments out the env_reset and secure_path defaults in /etc/sudoers).
RUN echo 'ALL ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \
&& sed -i -e 's/^Defaults\s*env_reset/#&/' -e 's/^Defaults\s*secure_path=/#&/' /etc/sudoers

# Copy the whole build context into the image.
COPY . /accelerated-computing-hub

# Ensure accelerated-computing-hub directory is writable by any user and setup shell initialization
# (user-setup.bash is hooked into both login shells and interactive bash shells).
RUN chmod -R a+rwX /accelerated-computing-hub \
&& mkdir -p /accelerated-computing-hub/logs \
&& chmod 777 /accelerated-computing-hub/logs \
&& ln -s /accelerated-computing-hub/brev/user-setup.bash /etc/profile.d/ach-user-setup.sh \
&& echo 'source /accelerated-computing-hub/brev/user-setup.bash' >> /etc/bash.bashrc

WORKDIR /accelerated-computing-hub/tutorials/${ACH_TUTORIAL}/notebooks

# Setup Git: drop any GitHub extraheader setting (exit code 5, "nothing to
# unset", is tolerated; any other failure aborts the build) and mark the
# repository directory as safe.
RUN git config --unset-all "http.https://github.com/.extraheader" || { code=$?; [ "$code" = 5 ] || exit "$code"; } \
&& git config --global --add safe.directory "/accelerated-computing-hub"

# Set default user to ach (can be overridden with docker run --user).
# NOTE(review): no instruction in this file creates the `ach` account;
# presumably the entrypoint script does — confirm.
USER ach

ENTRYPOINT ["/accelerated-computing-hub/brev/entrypoint.bash", "jupyter"]
24 changes: 24 additions & 0 deletions tutorials/floating-point-emulation/brev/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Python packages for the floating-point-emulation tutorial.
# Only nvmath-python and cuda-bindings carry version constraints; every other
# package is unpinned.

# NVMATH + CTK 13.1 + CCCL + cublas
nvmath-python[dx]==0.8.*
cuda-core
cuda-bindings==13.1.*
cuda-cccl

# Scientific computing
numpy
scipy
ssgetpy
cupy-cuda13x

# Visualization
matplotlib

# Jupyter (JupyterLab, extensions, and kernel)
jupyterlab
jupyterlab-nvidia-nsight
jupyterlab-execute-time
ipywidgets
ipykernel

# MPI
mpi4py
3 changes: 3 additions & 0 deletions tutorials/floating-point-emulation/brev/test.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#! /bin/bash

# Runs nvidia-smi as the only command, so the script's exit status is
# nvidia-smi's exit status.
nvidia-smi
67 changes: 67 additions & 0 deletions tutorials/floating-point-emulation/cmake/common.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Global CXX flags/options: require ISO C++17 without compiler extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Enable CTest so that add_test() registrations take effect.
enable_testing()

# Also search the CUDA 13.1 toolkit's bin directory when locating programs.
list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")

# Set default arguments.
# TUTORIAL_CUDA_ARCHITECTURE is a cache entry, so -DTUTORIAL_CUDA_ARCHITECTURE=...
# on the command line overrides the default of 89.
set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")

# Default to a Release build only when no build type was requested.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
endif()

# Find cuBLASDx (shipped in the MathDx package).
# The CHECK_START message is closed with CHECK_PASS once the package is found;
# because the package is REQUIRED, a failed lookup aborts configuration before
# the CHECK_PASS line is reached.
message(CHECK_START "Example Wrapper: Looking for MathDx package")
find_package(mathdx REQUIRED CONFIG
  PATHS
    "/opt/nvidia/mathdx/25.12"
)
message(CHECK_PASS "found")

find_package(CUDAToolkit REQUIRED)

# Validate TUTORIAL_CUDA_ARCHITECTURE and derive the values used by add_tutorial():
#   TUTORIAL_SM          - the numeric part with "0" appended (e.g. 89 -> 890)
#   TUTORIAL_SM_LETTER   - the optional one-letter suffix ("" if absent)
#   TUTORIAL_SM_MODIFIER - the cublasdx:: modifier matching that suffix
# Finally the raw value is forwarded to CMAKE_CUDA_ARCHITECTURES.
if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
  message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
endif()

# Accept digits optionally followed by a single lowercase letter.
if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
  message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
endif()

# Split into number (CMAKE_MATCH_1) and optional letter (CMAKE_MATCH_2).
string(REGEX MATCH "^([0-9]+)([A-Za-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")

set(TUTORIAL_SM "${CMAKE_MATCH_1}0")
set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # will be empty if no letter

if(TUTORIAL_SM_LETTER STREQUAL "")
  # Case: no letter
  set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
elseif(TUTORIAL_SM_LETTER STREQUAL "a")
  # Case: letter 'a'
  set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
elseif(TUTORIAL_SM_LETTER STREQUAL "f")
  # Case: letter 'f'
  set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
else()
  # Any other suffix letter passes the format check above but has no
  # cublasdx modifier; stop with an explicit diagnostic.
  message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
endif()

set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")

# Header-only helper target: linking against `helpers` adds the include/
# directory to the consumer's include path.
add_library(helpers INTERFACE)
target_include_directories(helpers INTERFACE include/)

# add_tutorial(<tutorial_name> <tutorial_file>)
#
# Builds <tutorial_file> into an executable named <tutorial_name> and registers
# that executable as a test of the same name.
#
# The target is compiled with SM_VALUE and SM_MODIFIER_VALUE defined from
# TUTORIAL_SM and TUTORIAL_SM_MODIFIER, and links against cuBLAS, cuBLASDx and
# the `helpers` interface library.
#
# Example:
#   add_tutorial(1a_simple_dgemm_tensor src/1a_simple_dgemm_tensor.cu)
function(add_tutorial tutorial_name tutorial_file)
  add_executable("${tutorial_name}" "${tutorial_file}")
  add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")

  target_compile_definitions("${tutorial_name}" PUBLIC
    SM_VALUE=${TUTORIAL_SM}
    SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER}
  )

  target_link_libraries("${tutorial_name}" PRIVATE
    CUDA::cublas
    mathdx::cublasdx
    helpers
  )

  # --expt-relaxed-constexpr is an nvcc option; restrict it to CUDA sources so
  # a host-only source file is not compiled with a flag its compiler rejects.
  target_compile_options("${tutorial_name}" PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>"
  )
endfunction()
66 changes: 66 additions & 0 deletions tutorials/floating-point-emulation/cmake/tutorial.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Global CXX flags/options: require ISO C++17 without compiler extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Enable CTest so that add_test() registrations take effect.
enable_testing()

# Set default arguments.
# TUTORIAL_CUDA_ARCHITECTURE is a cache entry, so -DTUTORIAL_CUDA_ARCHITECTURE=...
# on the command line overrides the default of 89.
set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")

# Default to a Release build only when no build type was requested.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
endif()

# Find cuBLASDx (shipped in the MathDx package).
# The CHECK_START message is closed with CHECK_PASS once the package is found;
# because the package is REQUIRED, a failed lookup aborts configuration before
# the CHECK_PASS line is reached.
message(CHECK_START "Example Wrapper: Looking for MathDx package")
find_package(mathdx REQUIRED CONFIG
  PATHS
    "/opt/nvidia/mathdx/25.12"
)
message(CHECK_PASS "found")

find_package(CUDAToolkit REQUIRED)

# Validate TUTORIAL_CUDA_ARCHITECTURE and derive the values used by add_tutorial():
#   TUTORIAL_SM          - the numeric part with "0" appended (e.g. 89 -> 890)
#   TUTORIAL_SM_LETTER   - the optional one-letter suffix ("" if absent)
#   TUTORIAL_SM_MODIFIER - the cublasdx:: modifier matching that suffix
# Finally the raw value is forwarded to CMAKE_CUDA_ARCHITECTURES.
if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
  message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
endif()

# Accept digits optionally followed by a single lowercase letter.
if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
  message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
endif()

# Split into number (CMAKE_MATCH_1) and optional letter (CMAKE_MATCH_2).
string(REGEX MATCH "^([0-9]+)([A-Za-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")

set(TUTORIAL_SM "${CMAKE_MATCH_1}0")
set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # will be empty if no letter

if(TUTORIAL_SM_LETTER STREQUAL "")
  # Case: no letter
  set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
elseif(TUTORIAL_SM_LETTER STREQUAL "a")
  # Case: letter 'a'
  set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
elseif(TUTORIAL_SM_LETTER STREQUAL "f")
  # Case: letter 'f'
  set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
else()
  # Any other suffix letter passes the format check above but has no
  # cublasdx modifier; stop with an explicit diagnostic.
  message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
endif()

set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")

# add_tutorial() links every tutorial against `tutorial_helpers`, so the
# including project must have created that target before including this file.
if(NOT TARGET tutorial_helpers)
message( FATAL_ERROR "Please add tutorial_helpers library before including tutorial.cmake" )
endif()

# add_tutorial(<tutorial_name> <tutorial_file>)
#
# Builds <tutorial_file> into an executable named <tutorial_name> and registers
# that executable as a test of the same name.
#
# The target is compiled with SM_VALUE and SM_MODIFIER_VALUE defined from
# TUTORIAL_SM and TUTORIAL_SM_MODIFIER, and links against cuBLAS, cuBLASDx and
# the `tutorial_helpers` library.
#
# Example:
#   add_tutorial(1a_simple_dgemm_tensor src/1a_simple_dgemm_tensor.cu)
function(add_tutorial tutorial_name tutorial_file)
  add_executable("${tutorial_name}" "${tutorial_file}")
  add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")

  target_compile_definitions("${tutorial_name}" PUBLIC
    SM_VALUE=${TUTORIAL_SM}
    SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER}
  )

  target_link_libraries("${tutorial_name}" PRIVATE
    CUDA::cublas
    mathdx::cublasdx
    tutorial_helpers
  )

  # --expt-relaxed-constexpr is an nvcc option; restrict it to CUDA sources so
  # a host-only source file is not compiled with a flag its compiler rejects.
  target_compile_options("${tutorial_name}" PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>"
  )
endfunction()
20 changes: 20 additions & 0 deletions tutorials/floating-point-emulation/cpp_source/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
cmake_minimum_required(VERSION 4.0)

# Extend the program search path before project() enables the CUDA language
# (presumably so the CUDA 13.1 compiler can be located).
list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")
project(cublasdx-dgemm-tutorial
  VERSION 0.1
  LANGUAGES CUDA CXX
)

# Header-only target exposing the tutorial helper headers under include/.
add_library(tutorial_helpers INTERFACE)
target_include_directories(tutorial_helpers INTERFACE include/)

# Provides add_tutorial(<name> <source>).
include(../cmake/common.cmake)

# Part 1: single-file DGEMM tutorials; the target name equals the source file stem.
foreach(dgemm_step IN ITEMS
    1a_simple_dgemm_tensor
    1b_simple_dgemm_shared
    1c_simple_dgemm_cublasdx
    1d_simple_pipelined_dgemm)
  add_tutorial("${dgemm_step}" "src/${dgemm_step}.cu")
endforeach()

# Part 2: DGEMM emulation tutorials; each lives in a directory named after its target.
foreach(emulation_step IN ITEMS
    2a_unfused_emulation
    2b_partially_fused_emulation
    2c_fully_fused_emulation)
  add_tutorial("${emulation_step}" "src/${emulation_step}/dgemm_emulation.cu")
endforeach()

# Part 3: SYRK emulation tutorial.
add_tutorial(3a_fused_syrk_emulation src/3a_fused_syrk_emulation/syrk_emulation.cu)

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#ifndef CUDA_CHECK_AND_EXIT
# define CUDA_CHECK_AND_EXIT(error) \
{ \
auto status = static_cast<cudaError_t>(error); \
if (status != cudaSuccess) { \
std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__ << std::endl; \
std::exit(status); \
} \
}
#endif
Loading
Loading