NVIDIA · cole-brower · Jan 20, 2026 · Jan 23, 2026 · Jan 25, 2026 · Jan 26, 2026
diff --git a/events/sca_2026/MathDx_Ozaki-I_Tutorial.pdf b/events/sca_2026/MathDx_Ozaki-I_Tutorial.pdf
diff --git a/tutorials/floating-point-emulation/brev/docker-compose.yml b/tutorials/floating-point-emulation/brev/docker-compose.yml
@@ -0,0 +1,67 @@
+name: &tutorial-name floating-point-emulation
+# Shared settings: the YAML anchors defined under x-config are reused by the services in this file.
+x-config:
+  dockerfile: &dockerfile tutorials/floating-point-emulation/brev/dockerfile
+  image: &image ghcr.io/nvidia/floating-point-emulation-tutorial:latest
+  working-dir: &working-dir /accelerated-computing-hub/tutorials/floating-point-emulation/notebooks
+  large: &large true
+  default-jupyter-url: &default-jupyter-url # no value given (null); used as the jupyter service's command
+  gpu-config: &gpu-config # merged into every service: privileged container with all NVIDIA GPUs reserved
+    privileged: true
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    shm_size: 1g
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+  common-service: &common-service # merged into every service: volumes, environment, user and working directory
+    pull_policy: missing
+    volumes:
+      - accelerated-computing-hub:/accelerated-computing-hub
+      - /var/run/docker.sock:/var/run/docker.sock # makes the host's Docker socket available inside the container
+    environment:
+      BREV_ENV_ID: ${BREV_ENV_ID:-}
+      ACH_TUTORIAL: *tutorial-name
+      ACH_RUN_TESTS: ${ACH_RUN_TESTS:-}
+      ACH_USER: ${ACH_USER:-ach} # defaults to "ach" when unset or empty
+      ACH_UID: ${ACH_UID:-1000}
+      ACH_GID: ${ACH_GID:-1000}
+    user: root
+    working_dir: *working-dir
+  persistent-service: &persistent-service # merged into the long-running services (jupyter, nsight)
+    depends_on:
+      base:
+        condition: service_completed_successfully # start only after the base service has exited successfully
+    restart: unless-stopped
+
+services:
+  base: # one-shot service; the persistent services wait for it to complete successfully
+    <<: [*gpu-config, *common-service]
+    image: *image
+    entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "base"]
+    build:
+      context: ../../.. # three levels up from this file; the dockerfile path below is relative to it
+      dockerfile: *dockerfile
+    restart: "no" # quoted so the value stays the string "no" (a bare no can be read as a boolean)
+  jupyter: # same image as base, started through the entrypoint in "jupyter" mode
+    <<: [*gpu-config, *common-service, *persistent-service]
+    image: *image
+    entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "jupyter"]
+    command: *default-jupyter-url
+    ports:
+      - "127.0.0.1:8888:8888" # JupyterLab
+  nsight: # uses the Nsight streamer image instead of the tutorial image
+    <<: [*gpu-config, *common-service, *persistent-service]
+    image: nvcr.io/nvidia/devtools/nsight-streamer-nsys:2025.3.1
+    entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "nsight"]
+    ports:
+      - "127.0.0.1:8080:8080" # HTTP
+      - "127.0.0.1:3478:3478" # TURN
+# Named volume mounted at /accelerated-computing-hub by every service (via common-service).
+volumes:
+  accelerated-computing-hub:
diff --git a/tutorials/floating-point-emulation/brev/dockerfile b/tutorials/floating-point-emulation/brev/dockerfile
@@ -0,0 +1,84 @@
+FROM nvidia/cuda:13.1.0-base-ubuntu24.04
+
+ENV PIP_ROOT_USER_ACTION=ignore \
+    ACH_TUTORIAL=floating-point-emulation \
+    BASH_ENV=/accelerated-computing-hub/brev/user-setup.bash
+
+# Add the Kitware and Docker apt repositories, then install CUDA Toolkit components, build tools, Python, Git, and Docker
+RUN apt update -y \
+    && apt install -y --no-install-recommends wget curl gnupg gosu lsb-release sudo \
+    && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
+    && echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
+    && curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
+    && apt update -y \
+    && apt install -y cuda-nvrtc-13-1 cuda-cccl-13-1 libcublas-dev-13-1 \
+    libnvjitlink-13-1 cuda-cudart-13-1 cuda-nvcc-13-1 libnvvm-13-1 \
+    python-is-python3 python3-venv \
+    build-essential cmake \
+    git git-lfs \
+    docker-ce \
+    docker-ce-cli \
+    containerd.io \
+    docker-buildx-plugin \
+    docker-compose-plugin \
+    && apt-get clean -y
+
+# Install MathDx
+RUN wget https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/cuda13/nvidia-mathdx-25.12.1-cuda13.tar.gz \
+    && tar -xvf nvidia-mathdx-25.12.1-cuda13.tar.gz \
+    && rm nvidia-mathdx-25.12.1-cuda13.tar.gz \
+    && mkdir -p /opt/nvidia \
+    && mv nvidia-mathdx-25.12.1-cuda13/nvidia/mathdx /opt/nvidia/mathdx \
+    && rm -rf nvidia-mathdx-25.12.1-cuda13
+
+# Install libmathdx
+RUN wget https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/cuda13/libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz \
+    && mkdir -p /opt/nvidia/libmathdx \
+    && tar -xvf libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz -C /opt/nvidia/libmathdx \
+    && rm libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz
+
+# Create a Python virtual environment and export CUDA_PATH, PATH, and LD_LIBRARY_PATH for the venv and libmathdx
+RUN python -m venv /opt/venv
+ENV CUDA_PATH=/usr/local/cuda-13.1 \
+    PATH="/opt/venv/bin:$PATH" \
+    LD_LIBRARY_PATH="/opt/nvidia/libmathdx/lib:$LD_LIBRARY_PATH"
+
+COPY tutorials/${ACH_TUTORIAL}/brev/requirements.txt /opt/requirements.txt
+
+RUN set -ex \
+ && `# Install Python packages` \
+     && pip install --no-cache-dir -r /opt/requirements.txt \
+     && rm /opt/requirements.txt
+
+RUN set -ex \
+ && `# Setup JupyterLab` \
+     && mkdir -p ~/.jupyter \
+     && ln -fs /accelerated-computing-hub/brev/jupyter-server-config.py ~/.jupyter/jupyter_server_config.py \
+     && mkdir -p ~/.ipython/profile_default/startup \
+     && ln -fs /accelerated-computing-hub/brev/ipython-startup-add-cwd-to-path.py ~/.ipython/profile_default/startup/00-add-cwd-to-path.py \
+     && python -m jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
+
+# Enable passwordless sudo for all users and pass through environment and path
+RUN echo 'ALL ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \
+ && sed -i -e 's/^Defaults\s*env_reset/#&/' -e 's/^Defaults\s*secure_path=/#&/' /etc/sudoers
+
+COPY . /accelerated-computing-hub
+
+# Ensure accelerated-computing-hub directory is writable by any user and setup shell initialization
+RUN chmod -R a+rwX /accelerated-computing-hub \
+ && mkdir -p /accelerated-computing-hub/logs \
+ && chmod 777 /accelerated-computing-hub/logs \
+ && ln -s /accelerated-computing-hub/brev/user-setup.bash /etc/profile.d/ach-user-setup.sh \
+ && echo 'source /accelerated-computing-hub/brev/user-setup.bash' >> /etc/bash.bashrc
+
+WORKDIR /accelerated-computing-hub/tutorials/${ACH_TUTORIAL}/notebooks
+
+# Setup Git.
+RUN git config --unset-all "http.https://github.com/.extraheader" || { code=$?; [ "$code" = 5 ] || exit "$code"; } \
+ && git config --global --add safe.directory "/accelerated-computing-hub"
+
+# Set default user to ach (can be overridden with docker run --user)
+USER ach
+
+ENTRYPOINT ["/accelerated-computing-hub/brev/entrypoint.bash", "jupyter"]
diff --git a/tutorials/floating-point-emulation/brev/requirements.txt b/tutorials/floating-point-emulation/brev/requirements.txt
@@ -0,0 +1,24 @@
+# NVMATH + CTK 13.1 + CCCL + cublas
+nvmath-python[dx]==0.8.*
+cuda-core
+cuda-bindings==13.1.*
+cuda-cccl
+
+# Scientific
+numpy
+scipy
+ssgetpy
+cupy-cuda13x
+
+# Visualization
+matplotlib
+
+# Jupyter
+jupyterlab
+jupyterlab-nvidia-nsight
+jupyterlab-execute-time
+ipywidgets
+ipykernel
+
+# MPI
+mpi4py
diff --git a/tutorials/floating-point-emulation/brev/test.bash b/tutorials/floating-point-emulation/brev/test.bash
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+nvidia-smi
diff --git a/tutorials/floating-point-emulation/cmake/common.cmake b/tutorials/floating-point-emulation/cmake/common.cmake
@@ -0,0 +1,67 @@
+# Enable testing (add_test), then pin the project-wide C++ dialect: C++17, required, no compiler extensions.
+enable_testing()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")
+
+# Defaults for user-facing settings; a value passed with -D on the command line takes precedence.
+set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
+endif()
+
+# Find MathDx (provides the mathdx::cublasdx target linked by add_tutorial)
+message(CHECK_START "Example Wrapper: Looking for MathDx package")
+find_package(mathdx REQUIRED CONFIG PATHS "/opt/nvidia/mathdx/25.12")
+# REQUIRED aborts configuration when the package is missing, so reaching this line means it was found.
+# Close the check opened by CHECK_START so the status line is completed instead of left dangling.
+message(CHECK_PASS "found")
+
+find_package(CUDAToolkit REQUIRED)  # provides the CUDA::cublas target linked by add_tutorial
+# Reject an unset or empty architecture before trying to parse it.
+if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
+	message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
+endif()
+# Accept only one or more digits followed by at most one lowercase letter.
+if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
+	message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
+endif()
+# Split the value into its numeric part (CMAKE_MATCH_1) and its optional letter (CMAKE_MATCH_2).
+string(REGEX MATCH "^([0-9]+)([A-Za-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")
+# TUTORIAL_SM is the numeric part with a "0" appended (e.g. 89 -> 890); it feeds the SM_VALUE definition.
+set(TUTORIAL_SM          "${CMAKE_MATCH_1}0")
+set(TUTORIAL_SM_LETTER   "${CMAKE_MATCH_2}")  # will be empty if no letter
+
+# Translate the optional architecture suffix letter into the value used for the
+# SM_MODIFIER_VALUE compile definition.
+if(TUTORIAL_SM_LETTER STREQUAL "")
+    # No suffix letter
+    set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
+elseif(TUTORIAL_SM_LETTER STREQUAL "a")
+    # Suffix 'a'
+    set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
+elseif(TUTORIAL_SM_LETTER STREQUAL "f")
+    # Suffix 'f'
+    set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
+else()
+    # Any other letter has no modifier mapping: stop configuration with an explicit diagnostic.
+    message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
+endif()
+
+set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")  # user value passed through unchanged, e.g. 89 or 100a
+# Header-only helper target: anything linking `helpers` gets include/ added to its include path.
+add_library(helpers INTERFACE)
+target_include_directories(helpers INTERFACE include/)
+
+# add_tutorial(<tutorial_name> <tutorial_file>)
+# Builds <tutorial_file> into the executable <tutorial_name> and registers it as a test of the same name.
+# The target gets the SM_VALUE / SM_MODIFIER_VALUE definitions and links cuBLAS, cuBLASDx and the helper headers.
+function(add_tutorial tutorial_name tutorial_file)
+    add_executable("${tutorial_name}" "${tutorial_file}")
+    add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")
+    target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM} SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER})
+    target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas mathdx::cublasdx helpers)
+    target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr")
+endfunction()
diff --git a/tutorials/floating-point-emulation/cmake/tutorial.cmake b/tutorials/floating-point-emulation/cmake/tutorial.cmake
@@ -0,0 +1,66 @@
+# Enable testing (add_test), then pin the project-wide C++ dialect: C++17, required, no compiler extensions.
+enable_testing()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Defaults for user-facing settings; a value passed with -D on the command line takes precedence.
+set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
+endif()
+
+# Find MathDx (provides the mathdx::cublasdx target linked by add_tutorial)
+message(CHECK_START "Example Wrapper: Looking for MathDx package")
+find_package(mathdx REQUIRED CONFIG PATHS "/opt/nvidia/mathdx/25.12")
+# REQUIRED aborts configuration when the package is missing, so reaching this line means it was found.
+# Close the check opened by CHECK_START so the status line is completed instead of left dangling.
+message(CHECK_PASS "found")
+
+find_package(CUDAToolkit REQUIRED)  # provides the CUDA::cublas target linked by add_tutorial
+# Reject an unset or empty architecture before trying to parse it.
+if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
+	message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
+endif()
+# Accept only one or more digits followed by at most one lowercase letter.
+if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
+	message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
+endif()
+# Split the value into its numeric part (CMAKE_MATCH_1) and its optional letter (CMAKE_MATCH_2).
+string(REGEX MATCH "^([0-9]+)([A-Za-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")
+# TUTORIAL_SM is the numeric part with a "0" appended (e.g. 89 -> 890); it feeds the SM_VALUE definition.
+set(TUTORIAL_SM          "${CMAKE_MATCH_1}0")
+set(TUTORIAL_SM_LETTER   "${CMAKE_MATCH_2}")  # will be empty if no letter
+
+# Translate the optional architecture suffix letter into the value used for the
+# SM_MODIFIER_VALUE compile definition.
+if(TUTORIAL_SM_LETTER STREQUAL "")
+    # No suffix letter
+    set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
+elseif(TUTORIAL_SM_LETTER STREQUAL "a")
+    # Suffix 'a'
+    set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
+elseif(TUTORIAL_SM_LETTER STREQUAL "f")
+    # Suffix 'f'
+    set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
+else()
+    # Any other letter has no modifier mapping: stop configuration with an explicit diagnostic.
+    message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
+endif()
+
+set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")  # user value passed through unchanged, e.g. 89 or 100a
+# add_tutorial links against tutorial_helpers, so the including project must define that target first.
+if(NOT TARGET tutorial_helpers)
+    message( FATAL_ERROR "Please add tutorial_helpers library before including tutorial.cmake" )
+endif()
+
+# add_tutorial(<tutorial_name> <tutorial_file>)
+# Builds <tutorial_file> into the executable <tutorial_name> and registers it as a test of the same name.
+# The target gets the SM_VALUE / SM_MODIFIER_VALUE definitions and links cuBLAS, cuBLASDx and tutorial_helpers.
+function(add_tutorial tutorial_name tutorial_file)
+    add_executable("${tutorial_name}" "${tutorial_file}")
+    add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")
+    target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM} SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER})
+    target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas mathdx::cublasdx tutorial_helpers)
+    target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr")
+endfunction()
diff --git a/tutorials/floating-point-emulation/cpp_source/CMakeLists.txt b/tutorials/floating-point-emulation/cpp_source/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 4.0)
+# Add the CUDA 13.1 bin directory to the program search path before project() enables the CUDA language (presumably so nvcc is found - confirm).
+LIST(APPEND CMAKE_PROGRAM_PATH  "/usr/local/cuda-13.1/bin")
+project(cublasdx-dgemm-tutorial VERSION 0.1 LANGUAGES CUDA CXX)
+
+# Header-only helper target: anything linking tutorial_helpers gets include/ added to its include path
+add_library(tutorial_helpers INTERFACE)
+target_include_directories(tutorial_helpers INTERFACE include/)
+# NOTE(review): common.cmake defines and links its own `helpers` target; tutorial_helpers is what ../cmake/tutorial.cmake requires - confirm which file should be included here.
+include(../cmake/common.cmake)
+# One executable and one test per tutorial step, each created by add_tutorial(<name> <source>).
+add_tutorial(1a_simple_dgemm_tensor src/1a_simple_dgemm_tensor.cu)
+add_tutorial(1b_simple_dgemm_shared src/1b_simple_dgemm_shared.cu)
+add_tutorial(1c_simple_dgemm_cublasdx src/1c_simple_dgemm_cublasdx.cu)
+add_tutorial(1d_simple_pipelined_dgemm src/1d_simple_pipelined_dgemm.cu)
+add_tutorial(2a_unfused_emulation src/2a_unfused_emulation/dgemm_emulation.cu)
+add_tutorial(2b_partially_fused_emulation src/2b_partially_fused_emulation/dgemm_emulation.cu)
+add_tutorial(2c_fully_fused_emulation src/2c_fully_fused_emulation/dgemm_emulation.cu)
+add_tutorial(3a_fused_syrk_emulation src/3a_fused_syrk_emulation/syrk_emulation.cu)
+
diff --git a/tutorials/floating-point-emulation/cpp_source/include/cuda_utilities.hpp b/tutorials/floating-point-emulation/cpp_source/include/cuda_utilities.hpp
@@ -0,0 +1,12 @@
+#pragma once
+// CUDA_CHECK_AND_EXIT(error): casts `error` to cudaError_t; if it is not cudaSuccess, prints the error string and file:line to std::cout and calls std::exit with the status. NOTE(review): this header includes nothing itself - includers must provide the CUDA runtime API, <iostream> and <cstdlib>.
+#ifndef CUDA_CHECK_AND_EXIT
+#    define CUDA_CHECK_AND_EXIT(error)                                                                      \
+        {                                                                                                   \
+            auto status = static_cast<cudaError_t>(error);                                                  \
+            if (status != cudaSuccess) {                                                                    \
+                std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__ << std::endl; \
+                std::exit(status);                                                                          \
+            }                                                                                               \
+        }
+#endif