diff --git a/events/sca_2026/MathDx_Ozaki-I_Tutorial.pdf b/events/sca_2026/MathDx_Ozaki-I_Tutorial.pdf new file mode 100644 index 00000000..5cacf1d3 Binary files /dev/null and b/events/sca_2026/MathDx_Ozaki-I_Tutorial.pdf differ diff --git a/tutorials/floating-point-emulation/brev/docker-compose.yml b/tutorials/floating-point-emulation/brev/docker-compose.yml new file mode 100644 index 00000000..8574267f --- /dev/null +++ b/tutorials/floating-point-emulation/brev/docker-compose.yml @@ -0,0 +1,67 @@ +name: &tutorial-name floating-point-emulation + +x-config: + dockerfile: &dockerfile tutorials/floating-point-emulation/brev/dockerfile + image: &image ghcr.io/nvidia/floating-point-emulation-tutorial:latest + working-dir: &working-dir /accelerated-computing-hub/tutorials/floating-point-emulation/notebooks + large: &large true + default-jupyter-url: &default-jupyter-url + gpu-config: &gpu-config + privileged: true + ulimits: + memlock: -1 + stack: 67108864 + shm_size: 1g + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + common-service: &common-service + pull_policy: missing + volumes: + - accelerated-computing-hub:/accelerated-computing-hub + - /var/run/docker.sock:/var/run/docker.sock + environment: + BREV_ENV_ID: ${BREV_ENV_ID:-} + ACH_TUTORIAL: *tutorial-name + ACH_RUN_TESTS: ${ACH_RUN_TESTS:-} + ACH_USER: ${ACH_USER:-ach} + ACH_UID: ${ACH_UID:-1000} + ACH_GID: ${ACH_GID:-1000} + user: root + working_dir: *working-dir + persistent-service: &persistent-service + depends_on: + base: + condition: service_completed_successfully + restart: unless-stopped + +services: + base: + <<: [*gpu-config, *common-service] + image: *image + entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "base"] + build: + context: ../../.. 
+ dockerfile: *dockerfile + restart: "no" + jupyter: + <<: [*gpu-config, *common-service, *persistent-service] + image: *image + entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "jupyter"] + command: *default-jupyter-url + ports: + - "127.0.0.1:8888:8888" # JupyterLab + nsight: + <<: [*gpu-config, *common-service, *persistent-service] + image: nvcr.io/nvidia/devtools/nsight-streamer-nsys:2025.3.1 + entrypoint: ["/accelerated-computing-hub/brev/entrypoint.bash", "nsight"] + ports: + - "127.0.0.1:8080:8080" # HTTP + - "127.0.0.1:3478:3478" # TURN + +volumes: + accelerated-computing-hub: diff --git a/tutorials/floating-point-emulation/brev/dockerfile b/tutorials/floating-point-emulation/brev/dockerfile new file mode 100644 index 00000000..96a0af56 --- /dev/null +++ b/tutorials/floating-point-emulation/brev/dockerfile @@ -0,0 +1,84 @@ +FROM nvidia/cuda:13.1.0-base-ubuntu24.04 + +ENV PIP_ROOT_USER_ACTION=ignore \ + ACH_TUTORIAL=floating-point-emulation \ + BASH_ENV=/accelerated-computing-hub/brev/user-setup.bash + +# Install CUDA Toolkit + build tools +RUN apt update -y \ + && apt install -y --no-install-recommends wget curl gnupg gosu lsb-release sudo \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \ + && apt update -y \ + && apt install -y cuda-nvrtc-13-1 cuda-cccl-13-1 libcublas-dev-13-1 \ + libnvjitlink-13-1 cuda-cudart-13-1 cuda-nvcc-13-1 
libnvvm-13-1 \ + python-is-python3 python3-venv \ + build-essential cmake \ + git git-lfs \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin \ + && apt-get clean -y + +# Install MathDx +RUN wget https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/cuda13/nvidia-mathdx-25.12.1-cuda13.tar.gz \ + && tar -xvf nvidia-mathdx-25.12.1-cuda13.tar.gz \ + && rm nvidia-mathdx-25.12.1-cuda13.tar.gz \ + && mkdir -p /opt/nvidia \ + && mv nvidia-mathdx-25.12.1-cuda13/nvidia/mathdx /opt/nvidia/mathdx \ + && rm -rf nvidia-mathdx-25.12.1-cuda13 + +# Install libmathdx +RUN wget https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/cuda13/libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz \ + && mkdir -p /opt/nvidia/libmathdx \ + && tar -xvf libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz -C /opt/nvidia/libmathdx \ + && rm libmathdx-Linux-x86_64-0.3.1-cuda13.0.tar.gz + +# Install python +RUN python -m venv /opt/venv +ENV CUDA_PATH=/usr/local/cuda-13.1 \ + PATH="/opt/venv/bin:$PATH" \ + LD_LIBRARY_PATH="/opt/nvidia/libmathdx/lib:$LD_LIBRARY_PATH" + +COPY tutorials/${ACH_TUTORIAL}/brev/requirements.txt /opt/requirements.txt + +RUN set -ex \ + && `# Install Python packages` \ + && pip install --no-cache-dir -r /opt/requirements.txt \ + && rm /opt/requirements.txt + +RUN set -ex \ + && `# Setup JupyterLab` \ + && mkdir -p ~/.jupyter \ + && ln -fs /accelerated-computing-hub/brev/jupyter-server-config.py ~/.jupyter/jupyter_server_config.py \ + && mkdir -p ~/.ipython/profile_default/startup \ + && ln -fs /accelerated-computing-hub/brev/ipython-startup-add-cwd-to-path.py ~/.ipython/profile_default/startup/00-add-cwd-to-path.py \ + && python -m jupyter labextension disable "@jupyterlab/apputils-extension:announcements" + +# Enable passwordless sudo for all users and pass through environment and path +RUN echo 'ALL ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \ + && sed -i -e 's/^Defaults\s*env_reset/#&/' -e 
's/^Defaults\s*secure_path=/#&/' /etc/sudoers + +COPY . /accelerated-computing-hub + +# Ensure accelerated-computing-hub directory is writable by any user and setup shell initialization +RUN chmod -R a+rwX /accelerated-computing-hub \ + && mkdir -p /accelerated-computing-hub/logs \ + && chmod 777 /accelerated-computing-hub/logs \ + && ln -s /accelerated-computing-hub/brev/user-setup.bash /etc/profile.d/ach-user-setup.sh \ + && echo 'source /accelerated-computing-hub/brev/user-setup.bash' >> /etc/bash.bashrc + +WORKDIR /accelerated-computing-hub/tutorials/${ACH_TUTORIAL}/notebooks + +# Setup Git. +RUN git config --unset-all "http.https://github.com/.extraheader" || { code=$?; [ "$code" = 5 ] || exit "$code"; } \ + && git config --global --add safe.directory "/accelerated-computing-hub" + +# Set default user to ach (can be overridden with docker run --user) +USER ach + +ENTRYPOINT ["/accelerated-computing-hub/brev/entrypoint.bash", "jupyter"] diff --git a/tutorials/floating-point-emulation/brev/requirements.txt b/tutorials/floating-point-emulation/brev/requirements.txt new file mode 100644 index 00000000..cc6009f5 --- /dev/null +++ b/tutorials/floating-point-emulation/brev/requirements.txt @@ -0,0 +1,24 @@ +# NVMATH + CTK 13.1 + CCCL + cublas +nvmath-python[dx]==0.8.* +cuda-core +cuda-bindings==13.1.* +cuda-cccl + +# Scientific +numpy +scipy +ssgetpy +cupy-cuda13x + +# Visualization +matplotlib + +# Jupyter +jupyterlab +jupyterlab-nvidia-nsight +jupyterlab-execute-time +ipywidgets +ipykernel + +# MPI +mpi4py diff --git a/tutorials/floating-point-emulation/brev/test.bash b/tutorials/floating-point-emulation/brev/test.bash new file mode 100755 index 00000000..715d07c0 --- /dev/null +++ b/tutorials/floating-point-emulation/brev/test.bash @@ -0,0 +1,3 @@ +#! 
/bin/bash + +nvidia-smi diff --git a/tutorials/floating-point-emulation/cmake/common.cmake b/tutorials/floating-point-emulation/cmake/common.cmake new file mode 100644 index 00000000..24a628e3 --- /dev/null +++ b/tutorials/floating-point-emulation/cmake/common.cmake @@ -0,0 +1,67 @@ +# Global CXX flags/options +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +enable_testing() + +LIST(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin") + +# Set default arguments +set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a") +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) +endif() + +# Find cuBLASDx +message(CHECK_START "Example Wrapper: Looking for MathDx package") +find_package(mathdx REQUIRED CONFIG + PATHS + "/opt/nvidia/mathdx/25.12" +) + +find_package(CUDAToolkit REQUIRED) + +if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "") + message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a") +endif() + +if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$") + message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a") +endif() + +string(REGEX MATCH "^([0-9]+)([A-Za-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}") + +set(TUTORIAL_SM "${CMAKE_MATCH_1}0") +set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # will be empty if no letter + +if(TUTORIAL_SM_LETTER STREQUAL "") + # Case: no letter + set(TUTORIAL_SM_MODIFIER "cublasdx::generic") + +elseif(TUTORIAL_SM_LETTER STREQUAL "a") + # Case: letter 'a' + set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific") + +elseif(TUTORIAL_SM_LETTER STREQUAL "f") + # Case: letter 'f' + set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific") + +else() + message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. 
Allowed: empty, 'a', or 'f'.") +endif() + +set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}") + +add_library(helpers INTERFACE) +target_include_directories(helpers INTERFACE include/) + +function(add_tutorial tutorial_name tutorial_file) + add_executable("${tutorial_name}" "${tutorial_file}") + add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}") + target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM}) + target_compile_definitions("${tutorial_name}" PUBLIC SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER}) + target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas) + target_link_libraries("${tutorial_name}" PRIVATE mathdx::cublasdx) + target_link_libraries("${tutorial_name}" PRIVATE helpers) + target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr") +endfunction() diff --git a/tutorials/floating-point-emulation/cmake/tutorial.cmake b/tutorials/floating-point-emulation/cmake/tutorial.cmake new file mode 100644 index 00000000..7993a770 --- /dev/null +++ b/tutorials/floating-point-emulation/cmake/tutorial.cmake @@ -0,0 +1,66 @@ +# Global CXX flags/options +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +enable_testing() + +# Set default arguments +set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a") +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) +endif() + +# Find cuBLASDx +message(CHECK_START "Example Wrapper: Looking for MathDx package") +find_package(mathdx REQUIRED CONFIG + PATHS + "/opt/nvidia/mathdx/25.12" +) + +find_package(CUDAToolkit REQUIRED) + +if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "") + message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. 
-DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a") +endif() + +if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$") + message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a") +endif() + +string(REGEX MATCH "^([0-9]+)([A-Za-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}") + +set(TUTORIAL_SM "${CMAKE_MATCH_1}0") +set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # will be empty if no letter + +if(TUTORIAL_SM_LETTER STREQUAL "") + # Case: no letter + set(TUTORIAL_SM_MODIFIER "cublasdx::generic") + +elseif(TUTORIAL_SM_LETTER STREQUAL "a") + # Case: letter 'a' + set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific") + +elseif(TUTORIAL_SM_LETTER STREQUAL "f") + # Case: letter 'f' + set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific") + +else() + message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.") +endif() + +set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}") + +if(NOT TARGET tutorial_helpers) + message( FATAL_ERROR "Please add tutorial_helpers library before including tutorial.cmake" ) +endif() + +function(add_tutorial tutorial_name tutorial_file) + add_executable("${tutorial_name}" "${tutorial_file}") + add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}") + target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM}) + target_compile_definitions("${tutorial_name}" PUBLIC SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER}) + target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas) + target_link_libraries("${tutorial_name}" PRIVATE mathdx::cublasdx) + target_link_libraries("${tutorial_name}" PRIVATE tutorial_helpers) + target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr") +endfunction() diff --git a/tutorials/floating-point-emulation/cpp_source/CMakeLists.txt b/tutorials/floating-point-emulation/cpp_source/CMakeLists.txt new file mode 100644 index 00000000..d7b87122 --- /dev/null +++ 
b/tutorials/floating-point-emulation/cpp_source/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 4.0) + +LIST(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin") +project(cublasdx-dgemm-tutorial VERSION 0.1 LANGUAGES CUDA CXX) + +# Add header tutorial helper files +add_library(tutorial_helpers INTERFACE) +target_include_directories(tutorial_helpers INTERFACE include/) + +include(../cmake/common.cmake) + +add_tutorial(1a_simple_dgemm_tensor src/1a_simple_dgemm_tensor.cu) +add_tutorial(1b_simple_dgemm_shared src/1b_simple_dgemm_shared.cu) +add_tutorial(1c_simple_dgemm_cublasdx src/1c_simple_dgemm_cublasdx.cu) +add_tutorial(1d_simple_pipelined_dgemm src/1d_simple_pipelined_dgemm.cu) +add_tutorial(2a_unfused_emulation src/2a_unfused_emulation/dgemm_emulation.cu) +add_tutorial(2b_partially_fused_emulation src/2b_partially_fused_emulation/dgemm_emulation.cu) +add_tutorial(2c_fully_fused_emulation src/2c_fully_fused_emulation/dgemm_emulation.cu) +add_tutorial(3a_fused_syrk_emulation src/3a_fused_syrk_emulation/syrk_emulation.cu) + diff --git a/tutorials/floating-point-emulation/cpp_source/include/cuda_utilities.hpp b/tutorials/floating-point-emulation/cpp_source/include/cuda_utilities.hpp new file mode 100644 index 00000000..0af19b71 --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/cuda_utilities.hpp @@ -0,0 +1,12 @@ +#pragma once + +#ifndef CUDA_CHECK_AND_EXIT +# define CUDA_CHECK_AND_EXIT(error) \ + { \ + auto status = static_cast(error); \ + if (status != cudaSuccess) { \ + std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#endif diff --git a/tutorials/floating-point-emulation/cpp_source/include/numerical.hpp b/tutorials/floating-point-emulation/cpp_source/include/numerical.hpp new file mode 100644 index 00000000..ca3a0c7a --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/numerical.hpp @@ -0,0 +1,81 @@ +#pragma once + 
+namespace tutorial { + + enum class matrix_half + { + lower, + upper + }; + + namespace detail { + template + struct is_complex_helper { + static constexpr bool value = false; + }; + + template + struct is_complex_helper> { + static constexpr bool value = true; + }; + + template + struct is_complex_helper> { + static constexpr bool value = true; + }; + + template + struct is_complex_helper> { + static constexpr bool value = true; + }; + } // namespace detail + + template + CUBLASDX_HOST_DEVICE constexpr bool is_complex() { + return detail::is_complex_helper::value; + } + + namespace detail { + template + double cbabs(T v) { + if constexpr (is_complex()) { + auto imag = std::abs(static_cast(v.imag())); + auto real = std::abs(static_cast(v.real())); + return (real + imag) / 2.0; + } else { + return std::abs(static_cast(v)); + } + } + } // namespace detail + + template + __host__ __device__ __forceinline__ constexpr T1 convert(T2 v) { + constexpr bool is_output_complex = cublasdx::detail::has_complex_interface_v; + constexpr bool is_input_complex = cublasdx::detail::has_complex_interface_v; + if constexpr (is_input_complex and is_output_complex) { + using t1_vt = typename T1::value_type; + return T1(convert(v.real()), convert(v.imag())); + } else if constexpr (is_output_complex) { + using t1_vt = typename T1::value_type; + return T1(convert(v), convert(v)); + } else if constexpr (is_input_complex) { + return convert(v.real()); + } else if constexpr (COMMONDX_STL_NAMESPACE::is_convertible_v) { + return static_cast(v); + } else if constexpr (COMMONDX_STL_NAMESPACE::is_constructible_v) { + return T1(v); + } else { + static_assert(COMMONDX_STL_NAMESPACE::is_convertible_v, + "Please provide your own conversion function"); + } + } + + template + struct converter { + template + CUBLASDX_HOST_DEVICE constexpr T operator()(V const& v) const { + return convert(v); + } + }; + +} // namespace tutorial diff --git 
a/tutorials/floating-point-emulation/cpp_source/include/performance_measurement.hpp b/tutorials/floating-point-emulation/cpp_source/include/performance_measurement.hpp new file mode 100644 index 00000000..65c1afee --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/performance_measurement.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "cuda_utilities.hpp" + +namespace tutorial { + + double real_gemm_tflops(unsigned m, unsigned n, unsigned k) { + return (2. * m * n * k) / 1e9; + } + + double real_syrk_tflops(unsigned n, unsigned k) { + double syrk_to_gemm_flop_ratio = ((n * (n + 1)) / 2.0) / static_cast(n * n); + return real_gemm_tflops(n, n, k) * syrk_to_gemm_flop_ratio; + } + + struct measure { + // Returns execution time in ms. + template + static float execution(Kernel&& kernel, + const unsigned int warm_up_runs, + const unsigned int runs, + cudaStream_t stream) { + cudaEvent_t startEvent, stopEvent; + CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + for (unsigned int i = 0; i < warm_up_runs; i++) { + kernel(stream); + } + + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream)); + for (unsigned int i = 0; i < runs; i++) { + kernel(stream); + } + CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + float time; + CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent)); + return time / runs; + } + }; + +} // namespace tutorial diff --git a/tutorials/floating-point-emulation/cpp_source/include/performance_structures.hpp b/tutorials/floating-point-emulation/cpp_source/include/performance_structures.hpp new file mode 100644 index 00000000..f38869b3 --- /dev/null +++ 
b/tutorials/floating-point-emulation/cpp_source/include/performance_structures.hpp @@ -0,0 +1,71 @@ +#pragma once + +namespace tutorial { + + typedef enum { + EMU_MATMUL_TILE_128x128x128 = 0, + EMU_MATMUL_TILE_256x128x128 = 1, + EMU_MATMUL_TILE_128x256x128 = 2, + EMU_MATMUL_TILE_128x128x256 = 3, + } emuMatmulTile_t; + + typedef enum { + EMU_MATMUL_STAGES_1 = 0, + EMU_MATMUL_STAGES_2 = 1, + EMU_MATMUL_STAGES_4 = 2, + EMU_MATMUL_STAGES_8 = 3, + } emuMatmulStages_t; + + typedef enum { + EMU_BLOCK_DIM_64x1x1 = 0, + EMU_BLOCK_DIM_128x1x1 = 1, + EMU_BLOCK_DIM_256x1x1 = 2 + } emuMatmulBlockDim_t; + + +template +struct EmuTileInfo; + +#define MAKE_EMU_TILE_INFO(TILE_M_, TILE_N_, TILE_K_) \ + template <> \ + struct EmuTileInfo { \ + static constexpr int TILE_M = TILE_M_; \ + static constexpr int TILE_N = TILE_N_; \ + static constexpr int TILE_K = TILE_K_; \ + } + +MAKE_EMU_TILE_INFO(128, 128, 128); +MAKE_EMU_TILE_INFO(256, 128, 128); +MAKE_EMU_TILE_INFO(128, 256, 128); +MAKE_EMU_TILE_INFO(128, 128, 256); + +template +struct EmuStageInfo; + +#define MAKE_EMU_STAGE_INFO(STAGE_COUNT_) \ + template <> \ + struct EmuStageInfo { \ + static constexpr int STAGE_COUNT = STAGE_COUNT_; \ + } + +MAKE_EMU_STAGE_INFO(1); +MAKE_EMU_STAGE_INFO(2); +MAKE_EMU_STAGE_INFO(4); +MAKE_EMU_STAGE_INFO(8); + +template +struct EmuBlockDimInfo; + +#define MAKE_EMU_BLOCK_DIM_INFO(BLOCK_X_, BLOCK_Y_, BLOCK_Z_) \ + template <> \ + struct EmuBlockDimInfo { \ + static constexpr int BLOCK_X = BLOCK_X_; \ + static constexpr int BLOCK_Y = BLOCK_Y_; \ + static constexpr int BLOCK_Z = BLOCK_Z_; \ + } + +MAKE_EMU_BLOCK_DIM_INFO(64, 1, 1); +MAKE_EMU_BLOCK_DIM_INFO(128, 1, 1); +MAKE_EMU_BLOCK_DIM_INFO(256, 1, 1); + +} diff --git a/tutorials/floating-point-emulation/cpp_source/include/random.hpp b/tutorials/floating-point-emulation/cpp_source/include/random.hpp new file mode 100644 index 00000000..c27f19ab --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/random.hpp @@ -0,0 +1,154 @@ 
+#pragma once + +#include "numerical.hpp" + +namespace tutorial { + + enum class random_distribution + { + any, + uniform, + normal + }; + + template + std::vector get_random_vector(Processor const& proc, Dist& dist, const size_t size, int seed = -1) { + + std::vector ret(size); + + std::generate(ret.begin(), ret.end(), [&]() { + static thread_local std::random_device rd; + static thread_local std::ranlux24_base gen((seed != -1) ? seed : rd()); + + return convert(proc(dist(gen))); + }); + + return ret; + } + + template + struct random_generator { + template + static std::vector generate(size_t size, OptionalDistArgs... dist_args); + }; + + template + struct random_generator { + static std::vector generate(size_t size, float mean, float sd, const int seed = -1) { + static_assert(commondx::is_floating_point_v, "Floating point output type required"); + auto dist = std::normal_distribution(mean, sd); + auto proc = cublasdx::identity {}; + return get_random_vector(proc, dist, size, seed); + } + }; + + template + struct random_generator { + template + static std::vector generate(size_t size, MinMaxType min, MinMaxType max, int seed = -1) { + static_assert(commondx::is_floating_point_v or commondx::is_integral_v, + "Datatype must be either recognized floating point or integral"); + auto dist = [&]() { + if constexpr (commondx::is_floating_point_v) { + return std::uniform_real_distribution(min, max); + } else { + return std::uniform_int_distribution(min, max); + } + CUTE_GCC_UNREACHABLE; + }(); + auto proc = cublasdx::identity {}; + return get_random_vector(proc, dist, size, seed); + } + }; + + template + struct random_generator { + static std::vector generate(size_t size, int seed = -1) { + if constexpr (commondx::is_floating_point_v) { + return random_generator::generate(size, 0.0, 1.0, seed); + } else if constexpr (commondx::is_signed_integral_v) { + return random_generator::generate(size, -20, 20, seed); + } else if constexpr (commondx::is_unsigned_integral_v) { + 
return random_generator::generate(size, 0, 40, seed); + } else { + static_assert(commondx::is_floating_point_v or commondx::is_integral_v); + } + CUTE_GCC_UNREACHABLE; + } + }; + + template + auto get_random_device_tensor(int size_x, int size_y, OptionalDistArgs... optional_dist_args) { + std::vector random_host_data = + random_generator::generate(size_x * size_y, optional_dist_args...); + + thrust::device_vector device_vector = random_host_data; + auto iter = cute::make_gmem_ptr(thrust::raw_pointer_cast(device_vector.data())); + auto shape = cute::make_shape(size_x, size_y); + auto stride_atom = cute::conditional_return( + cute::LayoutLeft {}, cute::LayoutRight {}); + auto tensor = cute::make_tensor(iter, shape, stride_atom); + + return cuda::std::make_tuple(std::move(device_vector), tensor); + } + + template + __global__ void make_tensor_symmetrix(Tensor tensor, matrix_half data_half) { + auto tid_m = threadIdx.x + blockIdx.x * blockDim.x; + auto tid_n = threadIdx.y + blockIdx.y * blockDim.y; + + if (tid_n > tid_m) + return; + } + + template + auto get_symmetric_random_device_tensor(int side_length, + matrix_half data_half, + OptionalDistArgs... optional_dist_args) { + int num_unique_elems = (side_length * (side_length + 1)) / 2; + std::vector random_host_data = + random_generator::generate(num_unique_elems, optional_dist_args...); + + std::vector symm_host_data(side_length * side_length); + + auto get_linear_index = [&](int x, int y) { + bool is_reversed = + (data_half == matrix_half::lower and y > x) or (data_half == matrix_half::upper and x > y); + + int actual_x = is_reversed ? x : y; + int actual_y = is_reversed ? y : x; + + auto const k_lower = ((actual_x - 1) * actual_x) / 2 + actual_y; + auto const k_upper = ((2 * side_length - actual_x) * (actual_x + 1)) / 2 - (side_length - actual_y); + + return (data_half == matrix_half::lower) ? 
k_lower : k_upper; + }; + + auto shape = cute::make_shape(side_length, side_length); + auto stride_atom = cute::conditional_return( + cute::LayoutLeft {}, cute::LayoutRight {}); + auto layout = cute::make_layout(shape, stride_atom); + + auto host_tensor = cute::make_tensor(symm_host_data.data(), layout); + + for (int i = 0; i < side_length; ++i) { + for (int j = 0; j < side_length; ++j) { + auto data_idx = get_linear_index(i, j); + host_tensor(i, j) = random_host_data.at(data_idx); + } + } + + thrust::device_vector device_vector = symm_host_data; + auto iter = cute::make_gmem_ptr(thrust::raw_pointer_cast(device_vector.data())); + auto tensor = cute::make_tensor(iter, layout); + + return cuda::std::make_tuple(std::move(device_vector), tensor); + } + +} // namespace tutorial diff --git a/tutorials/floating-point-emulation/cpp_source/include/reference/check_error.cu b/tutorials/floating-point-emulation/cpp_source/include/reference/check_error.cu new file mode 100644 index 00000000..622f57a4 --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/reference/check_error.cu @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "check_error.hpp" + +#include +#include + +namespace example {} // namespace example diff --git a/tutorials/floating-point-emulation/cpp_source/include/reference/check_error.hpp b/tutorials/floating-point-emulation/cpp_source/include/reference/check_error.hpp new file mode 100644 index 00000000..3674476f --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/reference/check_error.hpp @@ -0,0 +1,276 @@ +#pragma once + +#include +#include "../tutorial_helpers.hpp" + +namespace tutorial { + namespace detail { + template + struct promote; + + template + struct promote and not is_complex()>> { + using value_type = int64_t; + }; + + template + struct promote and not is_complex()>> { + using value_type = uint64_t; + }; + + template + struct promote and not is_complex()>> { + using value_type = double; + }; + + template class Complex> + struct promote, std::enable_if_t>()>> { + using promoted_internal = typename promote::value_type; + + using value_type = cublasdx::complex; + }; + + template + using get_reference_value_type_t = typename promote::value_type; + } // namespace detail + + template + bool is_error_acceptable(double tot_rel_err) { + if (!std::isfinite(tot_rel_err)) { + return false; + } + + constexpr bool is_fp8_a_b_c = + (commondx::is_floating_point_v and not is_complex() and sizeof(TA) == 1) || + (commondx::is_floating_point_v and not is_complex() and sizeof(TB) == 1) || + (commondx::is_floating_point_v and not is_complex() and sizeof(TC) == 1); + + constexpr bool is_fp8_a_b_c_complex = + (commondx::is_floating_point_v and is_complex() and sizeof(TA) == 2) || + (commondx::is_floating_point_v and is_complex() and sizeof(TB) == 2) || + (commondx::is_floating_point_v and is_complex() and sizeof(TC) == 2); + + constexpr bool is_bf16_a_b_c = + std::is_same_v || std::is_same_v || std::is_same_v; + + constexpr bool is_bf16_a_b_c_complex = std::is_same_v> || + std::is_same_v> || + std::is_same_v>; + + constexpr bool is_integral = + 
commondx::is_integral_v and commondx::is_integral_v and commondx::is_integral_v; + + constexpr bool is_non_float_non_double_a_b_c = + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v> && !std::is_same_v>) || + (!std::is_same_v> && !std::is_same_v>) || + (!std::is_same_v> && !std::is_same_v>); + + if constexpr (is_integral) { + if (tot_rel_err != 0.0) { + std::cout << tot_rel_err << std::endl; + return false; + } + } else if (is_fp8_a_b_c) { + if (tot_rel_err > 7e-2) { + std::cout << tot_rel_err << std::endl; + return false; + } + } else if (is_fp8_a_b_c_complex) { + if (tot_rel_err > 1e-1) { + std::cout << tot_rel_err << std::endl; + return false; + } + } else if (is_bf16_a_b_c_complex) { + if (tot_rel_err > 6e-2) { + std::cout << tot_rel_err << std::endl; + return false; + } + } else if (is_bf16_a_b_c) { + if (tot_rel_err > 5e-2) { + std::cout << tot_rel_err << std::endl; + return false; + } + } else if (is_non_float_non_double_a_b_c) { + if (tot_rel_err > 1e-2) { + std::cout << tot_rel_err << std::endl; + return false; + } + } else { // A,B,C are either float or double + if (tot_rel_err > 1e-3) { + std::cout << tot_rel_err << std::endl; + return false; + } + } + return true; + } + + template + constexpr bool is_reference_type() { + return std::is_same_v>; + } + + template + std::enable_if_t() or not std::is_same_v, double> calculate_error( + const std::vector& data, + const std::vector& reference, + bool verbose = false, + bool print = false) { + using ref_t = detail::get_reference_value_type_t; + std::vector input_upcasted; + std::transform(std::cbegin(data), std::cend(data), std::back_inserter(input_upcasted), converter {}); + + // if only the input data required conversion, run comparison + if constexpr (is_reference_type()) { + return calculate_error(input_upcasted, reference, verbose, print); + } + // else, if the reference was also calculated in lower precision, + 
// also upcast it and only then run the comparison + else { + std::vector reference_upcasted; + std::transform(std::cbegin(reference), + std::cend(reference), + std::back_inserter(reference_upcasted), + converter {}); + return calculate_error(input_upcasted, reference_upcasted, verbose, print); + } + } + + template + bool check_error_custom(const std::vector& results, + const std::vector& reference, + bool verbose = false, + bool print = false) { + [[maybe_unused]] constexpr bool is_floating = commondx::is_floating_point_v; + [[maybe_unused]] constexpr bool is_integral = commondx::is_integral_v; + + auto ret = false; + + if constexpr (is_floating) { + double error = calculate_error(results, reference, verbose, print); + ret = is_error_acceptable(error); + } else if constexpr (is_integral) { + // If the input was integral, then we want absolute equality + if (print) { + std::cout << "Ref\tRes\n"; + } + ret = std::equal(reference.cbegin(), reference.cend(), results.cbegin(), [print](auto ref, auto res) { + if (print) { + if constexpr (is_complex()) { + std::cout << ref.real() << "," << ref.imag() << "\t" << res.real() << "," << res.imag() << "\n"; + } else { + std::cout << ref << "\t" << res << "\n"; + } + } + if constexpr (is_complex()) { + return ref.real() == res.real() and ref.imag() == res.imag(); + } else { + return ref == res; + } + }); + } else { + static_assert(is_floating or is_integral, + "Reference and result must either both be integral or floating point."); + } + + return ret; + } + template + bool check_error(const std::vector& results, + const std::vector& reference, + bool verbose = false, + bool print = false) { + using a_prec_t = typename cublasdx::precision_of::a_type; + using b_prec_t = typename cublasdx::precision_of::b_type; + using c_prec_t = typename cublasdx::precision_of::c_type; + return check_error_custom(results, reference, verbose, print); + } + + template + bool check_error(const std::vector& results, + const std::vector& reference, + 
bool verbose = false, + bool print = false) { + return check_error_custom(results, reference, verbose, print); + } + + template + double calculate_error(const std::vector& data, const std::vector& reference, bool verbose, bool print) { + using std::abs; + using std::sqrt; + + // Use either double or complex for error computation + using value_type = cute::remove_cvref_t; + using error_type = std::conditional_t(), cublasdx::complex, double>; + + if (print && verbose) { + printf("Idx:\tVal\tRefVal\tRelError\n"); + } + + double eps = 1e-200; + + double tot_error_sq = 0; + double tot_norm_sq = 0; + double tot_res_norm_sq = 0; + double tot_ind_rel_err = 0; + double max_ind_rel_err = 0; + double max_ind_abs_err = 0; + for (std::size_t i = 0; i < data.size(); ++i) { + error_type val = convert(data[i]); + error_type ref = convert(reference[i]); + + double aref = detail::cbabs(ref); + double aval = detail::cbabs(val); + double diff = std::abs(aref - aval); + + double rel_error = diff / (aref + eps); + + // Individual relative error + tot_ind_rel_err += rel_error; + + // Maximum relative error + max_ind_rel_err = std::max(max_ind_rel_err, rel_error); + max_ind_abs_err = std::max(max_ind_abs_err, diff); + + // Total relative error + tot_error_sq += diff * diff; + tot_norm_sq += aref * aref; + + const double inc = detail::cbabs(val) * detail::cbabs(val); + tot_res_norm_sq += inc; + + if ((print && verbose) and (detail::cbabs(diff) > 0.01)) { + if constexpr (is_complex()) { + std::cout << i << ":\t" << '<' << val.real() << ',' << val.imag() << '>' << "\t" << '<' + << ref.real() << ',' << ref.imag() << '>' << "\t" << rel_error << "\n"; + } else { + std::cout << i << ":\t" << val << "\t" << ref << "\t" << rel_error << "\n"; + } + } + } + + if (print) + printf("Vector reference norm: [%.5e]\n", sqrt(tot_norm_sq)); + + if (print) + printf("Vector result norm: [%.5e]\n", sqrt(tot_res_norm_sq)); + + double tot_rel_err = sqrt(tot_error_sq / (tot_norm_sq + eps)); + if (print) + 
printf("Vector relative error: [%.5e]\n", tot_rel_err); + + double ave_rel_err = tot_ind_rel_err / double(data.size()); + if (print) + printf("Average relative error: [%.5e]\n", ave_rel_err); + + if (print) + printf("Maximum relative error: [%.5e]\n", max_ind_rel_err); + + if (print) + printf("Maximum absolute error: [%.5e]\n", max_ind_abs_err); + + return tot_rel_err; + } +} // namespace tutorial diff --git a/tutorials/floating-point-emulation/cpp_source/include/reference/cublaslt_runner.hpp b/tutorials/floating-point-emulation/cpp_source/include/reference/cublaslt_runner.hpp new file mode 100644 index 00000000..c04c1ad4 --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/reference/cublaslt_runner.hpp @@ -0,0 +1,436 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CUBLASDX_EXAMPLE_CUBLASLT_RUNNER_HPP +#define CUBLASDX_EXAMPLE_CUBLASLT_RUNNER_HPP + +#include "../cuda_utilities.hpp" +#include "../tutorial_helpers.hpp" + +#include +#include + +#ifndef CUBLAS_CHECK_AND_EXIT +# define CUBLAS_CHECK_AND_EXIT(error) \ + { \ + auto status = static_cast(error); \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + if (status == CUBLAS_STATUS_NOT_SUPPORTED) { \ + std::cout << "Config not supported by cuBLASLt, " \ + << "please consult https://docs.nvidia.com/cuda/cublas/#id81 " \ + << "for more detail on supported reference configurations" << std::endl; \ + } \ + std::cout << status << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#endif // CUBLAS_CHECK_AND_EXIT + + +namespace example { + + constexpr cublasOperation_t get_cublas_transpose_mode(cublasdx::transpose_mode tmode) { + if (tmode == cublasdx::transpose_mode::non_transposed) { + return CUBLAS_OP_N; + } else if (tmode == cublasdx::transpose_mode::transposed) { + return CUBLAS_OP_T; + } + return CUBLAS_OP_C; + } + + constexpr cublasOperation_t get_cublas_transpose_mode(cublasdx::arrangement arr) { + if (arr == cublasdx::col_major) { + return CUBLAS_OP_N; + } + return CUBLAS_OP_T; + } + + constexpr cublasLtOrder_t get_cublas_layout_order(cublasdx::arrangement arr) { + return (arr == cublasdx::col_major) ? 
CUBLASLT_ORDER_COL : CUBLASLT_ORDER_ROW; + } + + template + constexpr cudaDataType_t get_cublas_data_type() { + if constexpr (cute::is_same_v>) { + return CUDA_C_64F; + } else if constexpr (cute::is_same_v>) { + return CUDA_C_32F; + } else if constexpr (cute::is_same_v>) { + return CUDA_C_16F; + } else if constexpr (cute::is_same_v>) { + return CUDA_C_16BF; + } else if constexpr (cute::is_same_v>) { + return CUDA_C_8I; + } else if constexpr (cute::is_same_v>) { + return CUDA_C_32I; + } else if constexpr (cute::is_same_v) { + return CUDA_R_64F; + } else if constexpr (cute::is_same_v) { + return CUDA_R_32F; + } else if constexpr (cute::is_same_v) { + return CUDA_R_32F; + } else if constexpr (cute::is_same_v) { + return CUDA_R_16F; + } else if constexpr (cute::is_same_v) { + return CUDA_R_16BF; +#if CUBLASDX_EXAMPLE_SUPPORTS_FP8 + } else if constexpr (cute::is_same_v) { + return CUDA_R_8F_E5M2; + } else if constexpr (cute::is_same_v) { + return CUDA_R_8F_E4M3; +#endif + } else if constexpr (cute::is_same_v) { + return CUDA_R_8I; + } else if constexpr (cute::is_same_v) { + return CUDA_R_8U; + } else if constexpr (cute::is_same_v) { + return CUDA_R_32I; + } else { + static_assert(sizeof(T) == 0, "Unsupported data type"); + } + + CUTE_GCC_UNREACHABLE; + } + + template + constexpr bool is_precision() { + return cute::is_same_v or cute::is_same_v>; + } + + template + constexpr cublasComputeType_t get_cublas_compute_type() { + if (is_precision()) { + return CUBLAS_COMPUTE_64F; + } else if (is_precision()) { + return CUBLAS_COMPUTE_32F_FAST_TF32; + } else if (is_precision()) { + return CUBLAS_COMPUTE_16F; + } else if (is_precision()) { + return CUBLAS_COMPUTE_32I; + } + + return CUBLAS_COMPUTE_32F; + } + + enum cublaslt_heuristic + { + run_default_best, + search_for_best + }; + + template + struct cublaslt_runner { + unsigned int result_size {}; + + cublasLtMatmulDesc_t operation_desc {}; + cublasLtMatmulPreference_t preference {}; + cublasLtMatrixLayout_t a_desc {}; + 
unsigned num_a_elements {}; + + cublasLtMatrixLayout_t b_desc {}; + unsigned num_b_elements {}; + + cublasLtMatrixLayout_t c_desc {}; + unsigned num_c_elements {}; + + cublasLtOrder_t cublas_order_a {}; + cublasLtOrder_t cublas_order_b {}; + cublasLtOrder_t cublas_order_c {}; + + cudaDataType_t cublas_data_type_a {}; + cudaDataType_t cublas_data_type_b {}; + cudaDataType_t cublas_data_type_c {}; + + cudaDataType_t cublas_scale_type {}; + cublasComputeType_t cublas_compute_type {}; + + cublasLtHandle_t lt_handle; + cublasLtMatmulHeuristicResult_t default_algorithm = {}; + + // Note: 32MB is the suggested size of workspace for cublasLt starting from Hopper arch + size_t workspace_size_in_bytes = 32 * 1024 * 1024; + mutable thrust::device_vector workspace_vector = thrust::device_vector(workspace_size_in_bytes); + + template + void initialize(GEMMShape gemm_shape, GEMMArr gemm_arr, GEMMLD gemm_ld) { + const auto [m, n, k] = gemm_shape; + const auto [lda, ldb, ldc] = gemm_ld; + const auto [arr_a, arr_b, arr_c] = gemm_arr; + + result_size = m * n; + + cublas_order_a = get_cublas_layout_order(arr_a); + cublas_order_b = get_cublas_layout_order(arr_b); + cublas_order_c = get_cublas_layout_order(arr_c); + + cublas_data_type_a = get_cublas_data_type(); + cublas_data_type_b = get_cublas_data_type(); + cublas_data_type_c = get_cublas_data_type(); + + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutCreate(&a_desc, cublas_data_type_a, m, k, lda)); + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutSetAttribute( + a_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &cublas_order_a, sizeof(cublas_order_a))); + num_a_elements = arr_a == cublasdx::col_major ? k * lda : m * lda ; + + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutCreate(&b_desc, cublas_data_type_b, k, n, ldb)); + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutSetAttribute( + b_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &cublas_order_b, sizeof(cublas_order_b))); + num_b_elements = arr_b == cublasdx::col_major ? 
n * ldb : k * ldb; + + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutCreate(&c_desc, cublas_data_type_c, m, n, ldc)); + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutSetAttribute( + c_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &cublas_order_c, sizeof(cublas_order_c))); + num_c_elements = arr_c == cublasdx::col_major ? n * ldc : m * ldc; + + cublas_scale_type = cublas_data_type_c; + cublas_compute_type = get_cublas_compute_type(); + CUBLAS_CHECK_AND_EXIT(cublasLtCreate(<_handle)); + + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulDescCreate(&operation_desc, cublas_compute_type, cublas_scale_type)); + + // Heuristic utils + int returned_results = 0; + + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulPreferenceCreate(&preference)); + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulPreferenceSetAttribute(preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size_in_bytes, + sizeof(workspace_size_in_bytes))); + + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulAlgoGetHeuristic(lt_handle, + operation_desc, + a_desc, + b_desc, + c_desc, + c_desc, + preference, + 1, + &default_algorithm, + &returned_results)); + + if (returned_results == 0) { + CUBLAS_CHECK_AND_EXIT(CUBLAS_STATUS_NOT_SUPPORTED); + } + } + + template + cublaslt_runner(GEMMShape gemm_shape, GEMMArr gemm_arr, GEMMLD gemm_ld) { + initialize(gemm_shape, gemm_arr, gemm_ld); + } + + template + cublaslt_runner(GEMMShape gemm_shape, GEMMArr gemm_arr) { + auto const [m, n, k] = gemm_shape; + auto const [arr_a, arr_b, arr_c] = gemm_arr; + auto const lda = (arr_a == cublasdx::col_major) ? m : k; + auto const ldb = (arr_b == cublasdx::col_major) ? k : n; + auto const ldc = (arr_c == cublasdx::col_major) ? 
m : n; + auto const gemm_ld = std::make_tuple(lda, ldb, ldc); + + initialize(gemm_shape, gemm_arr, gemm_ld); + } + + void execute(CComputeType const& alpha, + AComputeType const* a, + BComputeType const* b, + CComputeType const& beta, + CComputeType* c, + cudaStream_t stream = 0) const { + auto runner = [&](cudaStream_t stream) { + CUBLAS_CHECK_AND_EXIT( + cublasLtMatmul(lt_handle, + operation_desc, + reinterpret_cast(&alpha), + reinterpret_cast(a), + a_desc, + reinterpret_cast(b), + b_desc, + reinterpret_cast(&beta), + reinterpret_cast(c), + c_desc, + reinterpret_cast(c), + c_desc, + &default_algorithm.algo, + reinterpret_cast(thrust::raw_pointer_cast(workspace_vector.data())), + workspace_size_in_bytes, + stream)); + }; + + runner(stream); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + } + + [[nodiscard]] std::vector execute_with_results(CComputeType const& alpha, + AComputeType const* a, + BComputeType const* b, + CComputeType const& beta, + CComputeType* c, + cudaStream_t stream = 0) const { + std::vector results(result_size); + this->execute(alpha, a, b, beta, c, stream); + CUDA_CHECK_AND_EXIT( + cudaMemcpy(results.data(), c, results.size() * sizeof(CComputeType), cudaMemcpyDeviceToHost)); + + return results; + } + + [[nodiscard]] float execute_with_time(CComputeType const& alpha, + AComputeType const* a, + BComputeType const* b, + CComputeType const& beta, + CComputeType* c, + unsigned kernel_warm_up_repeats, + unsigned kernel_repeats, + cudaStream_t stream = 0) const { + // Find best algorithm + auto heuristic_runner = [&](auto algo, cudaStream_t stream) { + CUBLAS_CHECK_AND_EXIT( + cublasLtMatmul(lt_handle, + operation_desc, + reinterpret_cast(&alpha), + reinterpret_cast(a), + a_desc, + reinterpret_cast(b), + b_desc, + reinterpret_cast(&beta), + reinterpret_cast(c), + c_desc, + reinterpret_cast(c), + c_desc, + &algo, + reinterpret_cast(thrust::raw_pointer_cast(workspace_vector.data())), + 
workspace_size_in_bytes, + stream)); + }; + + constexpr int repeat_algo_check = 5; + const int requested_algo_count = 8; + cublasLtMatmulHeuristicResult_t heuristic_results[requested_algo_count] = {}; + int returned_results = 0; + int best_algo_index = 0; + float best_algo_time = 0; + + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulAlgoGetHeuristic(lt_handle, + operation_desc, + a_desc, + b_desc, + c_desc, + c_desc, + preference, + requested_algo_count, + heuristic_results, + &returned_results)); + + if (returned_results == 0) { + CUBLAS_CHECK_AND_EXIT(CUBLAS_STATUS_NOT_SUPPORTED); + } + + // Cast away const, but data will only be reloaded for L2 reset + // no changes will actually happen to it + auto mutable_a = const_cast(a); + auto mutable_b = const_cast(b); + + // Prepare host data for L2 reset + std::vector host_a_data(num_a_elements); + std::vector host_b_data(num_b_elements); + std::vector host_c_data(num_c_elements); + + CUDA_CHECK_AND_EXIT(cudaMemcpy((void*)host_a_data.data(), + (void const*)a, + host_a_data.size() * sizeof(AComputeType), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_AND_EXIT(cudaMemcpy((void*)host_b_data.data(), + (void const*)b, + host_b_data.size() * sizeof(BComputeType), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_AND_EXIT(cudaMemcpy((void*)host_c_data.data(), + (void*)c, + host_c_data.size() * sizeof(CComputeType), + cudaMemcpyDeviceToHost)); + + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + // Reset L2 cache + { + CUDA_CHECK_AND_EXIT(cudaMemcpy((void*)mutable_a, + (void*)host_a_data.data(), + host_a_data.size() * sizeof(AComputeType), + cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaMemcpy((void*)mutable_b, + (void*)host_b_data.data(), + host_b_data.size() * sizeof(BComputeType), + cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaMemcpy((void*)c, + (void*)host_c_data.data(), + host_c_data.size() * sizeof(CComputeType), + cudaMemcpyHostToDevice)); + } + + auto time = tutorial::measure::execution( + [&](auto stream) { 
heuristic_runner(heuristic_results[algo_idx].algo, stream); }, + 1 /* warm up runs*/, + repeat_algo_check /* kernel runs */, + stream); + + if (algo_idx == 0 || time < best_algo_time) { + best_algo_time = time; + best_algo_index = algo_idx; + } + } + + auto time_cublas = tutorial::measure::execution( + [&](auto stream) { heuristic_runner(heuristic_results[best_algo_index].algo, stream); }, + kernel_warm_up_repeats, + kernel_repeats, + stream); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + return time_cublas; + } + + [[nodiscard]] std::tuple> execute_with_time_and_results( + CComputeType const& alpha, + AComputeType const* a, + BComputeType const* b, + CComputeType const& beta, + CComputeType* c, + unsigned kernel_warm_up_repeats, + unsigned kernel_repeats, + cudaStream_t stream = 0) const { + auto results = this->execute_with_results(alpha, a, b, beta, c, stream); + auto time = this->execute_with_time(alpha, a, b, beta, c, kernel_warm_up_repeats, kernel_repeats, stream); + + return std::make_tuple(time, results); + } + + ~cublaslt_runner() { + if (preference) + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulPreferenceDestroy(preference)); + if (c_desc) + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutDestroy(c_desc)); + if (b_desc) + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutDestroy(b_desc)); + if (a_desc) + CUBLAS_CHECK_AND_EXIT(cublasLtMatrixLayoutDestroy(a_desc)); + if (operation_desc) + CUBLAS_CHECK_AND_EXIT(cublasLtMatmulDescDestroy(operation_desc)); + } + }; +} // namespace example + +#endif // CUBLASDX_EXAMPLE_CUBLASLT_RUNNER_HPP diff --git a/tutorials/floating-point-emulation/cpp_source/include/reference/naive_reference.cu b/tutorials/floating-point-emulation/cpp_source/include/reference/naive_reference.cu new file mode 100644 index 00000000..d59444c7 --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/reference/naive_reference.cu @@ -0,0 +1,142 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define CUBLASDX_EXAMPLE_NO_THRUST +#include "naive_reference.hpp" + +namespace example { + + template + CUTE_HOST_DEVICE void dot(AccumType& accum, + cute::Tensor const& tensor_a, + cute::Tensor const& tensor_b) { + static_assert(cute::rank(decltype(tensor_a.layout()) {}) == cute::rank(decltype(tensor_b.layout()) {}) == 1); + for (unsigned int k_iter = 0; k_iter < cute::size<0>(tensor_a); ++k_iter) { + // Cast needed to emulate --> higher += lower * lower + accum += convert(tensor_a(k_iter)) * convert(tensor_b(k_iter)); + } + } + + template + __global__ void reference_gemm_naive_kernel(Alpha alpha, + cute::Tensor const tensor_a, + cute::Tensor const tensor_b, + Beta beta, + cute::Tensor tensor_c) { + + using value_type_c = typename CEngine::value_type; + + auto idx_m = blockIdx.x * blockDim.x + threadIdx.x; + auto idx_n = blockIdx.y * blockDim.y + threadIdx.y; + + if (idx_m < cute::size<0>(tensor_c) and idx_n < cute::size<1>(tensor_c)) { + value_type_c acc = convert(0.f); + dot(acc, tensor_a(idx_m, cute::_), tensor_b(cute::_, idx_n)); + tensor_c(idx_m, idx_n) = alpha * acc + beta * tensor_c(idx_m, idx_n); + } + } + + + template + void reference_gemm_naive_host(Alpha alpha, + cute::Tensor const& tensor_a, + cute::Tensor const& tensor_b, + Beta beta, + cute::Tensor& tensor_c) { + assert(cute::size<1>(tensor_a) 
== cute::size<0>(tensor_b)); + using value_type_c = typename CEngine::value_type; + + for (unsigned int idx_m = 0; idx_m < cute::size<0>(tensor_c); ++idx_m) { + for (unsigned int idx_n = 0; idx_n < cute::size<1>(tensor_c); ++idx_n) { + value_type_c acc = convert(0.f); + dot(acc, tensor_a(idx_m, cute::_), tensor_b(cute::_, idx_n)); + tensor_c(idx_m, idx_n) = alpha * acc + beta * tensor_c(idx_m, idx_n); + } + } + } + + template + void reference_gemm_naive_device(unsigned_tuple const& gemm_shape, + arr_tuple const& gemm_arr, + unsigned_tuple const& gemm_ld, + ValueType const& alpha, + device_vector const& A, + device_vector const& B, + ValueType const& beta, + device_vector& C) { + // Unpack arguments + auto [m, n, k] = gemm_shape; + auto [lda, ldb, ldc] = gemm_ld; + auto [arr_a, arr_b, arr_c] = gemm_arr; + + auto make_tensor = [](auto ptr, auto sx, auto sy, auto ld, bool col_major) { + return cute::make_tensor( + ptr, + cute::make_layout(cute::make_shape(sx, sy), cute::make_stride(col_major ? 1 : ld, col_major ? 
ld : 1))); + }; + + cute::Tensor tensor_a = make_tensor(A.data(), m, k, lda, arr_a == cublasdx::col_major); + cute::Tensor tensor_b = make_tensor(B.data(), k, n, ldb, arr_b == cublasdx::col_major); + cute::Tensor tensor_c = make_tensor(C.data(), m, n, ldc, arr_c == cublasdx::col_major); + + // Decide if device or host execution + const dim3 block_dim = {16, 16, 1}; + const dim3 grid_dim = {cute::ceil_div(m, block_dim.x), cute::ceil_div(n, block_dim.y), 1}; + reference_gemm_naive_kernel<<>>(alpha, tensor_a, tensor_b, beta, tensor_c); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + } + +#define REFERENCE_FOR_TYPE(Prec) \ + template void reference_gemm_naive_device(unsigned_tuple const& gemm_shape, \ + arr_tuple const& gemm_arr, \ + unsigned_tuple const& gemm_ld, \ + Prec const& alpha, \ + device_vector const& A, \ + device_vector const& B, \ + Prec const& beta, \ + device_vector& C); \ + \ + template void reference_gemm_naive_device>( \ + unsigned_tuple const& gemm_shape, \ + arr_tuple const& gemm_arr, \ + unsigned_tuple const& gemm_ld, \ + cublasdx::complex const& alpha, \ + device_vector> const& A, \ + device_vector> const& B, \ + cublasdx::complex const& beta, \ + device_vector>& C); + + REFERENCE_FOR_TYPE(double) + REFERENCE_FOR_TYPE(int64_t) + REFERENCE_FOR_TYPE(uint64_t) + +} // namespace example diff --git a/tutorials/floating-point-emulation/cpp_source/include/reference/naive_reference.hpp b/tutorials/floating-point-emulation/cpp_source/include/reference/naive_reference.hpp new file mode 100644 index 00000000..373c4840 --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/reference/naive_reference.hpp @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP +#define CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP + +#include +#include "../common/common.hpp" + +namespace example { + using unsigned_tuple = cute::tuple; + using arr_tuple = cute::tuple; + + template + void reference_gemm_naive_device(unsigned_tuple const& gemm_shape, + arr_tuple const& gemm_arr, + unsigned_tuple const& gemm_ld, + ValueType const& alpha, + device_vector const& A, + device_vector const& B, + ValueType const& beta, + device_vector& C); +} // namespace example + +#endif // CUBLASDX_EXAMPLE_NAIVE_REFERENCE_HPP diff --git a/tutorials/floating-point-emulation/cpp_source/include/reference/reference.hpp b/tutorials/floating-point-emulation/cpp_source/include/reference/reference.hpp new file mode 100644 index 00000000..b521743a --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/reference/reference.hpp @@ -0,0 +1,138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cublaslt_runner.hpp" +#include "check_error.hpp" +#include + +namespace tutorial { + + template + auto cublaslt_reference(Alpha alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + Beta beta, + CTensor& tensor_c, + cudaStream_t stream = 0, + int kernel_warm_up_repeats = 10, + int kernel_repeats = 100) { + + using cublas_a_value_type = tensor_value_type_t; + using cublas_b_value_type = tensor_value_type_t; + using cublas_c_value_type = tensor_value_type_t; + + assert(cute::size<0>(tensor_a) == cute::size<0>(tensor_c)); // Check M + assert(cute::size<1>(tensor_b) == cute::size<1>(tensor_c)); // Check N + assert(cute::size<1>(tensor_a) == cute::size<0>(tensor_b)); // Check K + + int size_m = cute::size<0>(tensor_a); + int size_n = cute::size<1>(tensor_b); + int size_k = cute::size<1>(tensor_a); + + auto global_shape = cute::make_tuple(size_m, size_n, size_k); + + assert(cute::stride<0>(tensor_a) == 1 or cute::stride<1>(tensor_a) == 1); // Verify if A is either Row/Col major + assert(cute::stride<0>(tensor_b) == 1 or cute::stride<1>(tensor_b) == 1); // Verify if B is either Row/Col major + assert(cute::stride<0>(tensor_c) == 1 or cute::stride<1>(tensor_c) == 1); // Verify if C is either Row/Col major + + auto arr_a = + (cute::stride<0>(tensor_a) == 1) ? cublasdx::arrangement::col_major : cublasdx::arrangement::row_major; + auto arr_b = + (cute::stride<0>(tensor_b) == 1) ? cublasdx::arrangement::col_major : cublasdx::arrangement::row_major; + auto arr_c = + (cute::stride<0>(tensor_c) == 1) ? 
cublasdx::arrangement::col_major : cublasdx::arrangement::row_major; + auto global_arrangement = cute::make_tuple(arr_a, arr_b, arr_c); + + auto [time_cublas, results_cublas] = + example::cublaslt_runner(global_shape, + global_arrangement) + .execute_with_time_and_results(alpha, + cute::raw_pointer_cast(tensor_a.data()), + cute::raw_pointer_cast(tensor_b.data()), + beta, + cute::raw_pointer_cast(tensor_c.data()), + kernel_warm_up_repeats, + kernel_repeats, + stream); + + auto tflops_cublas = real_gemm_tflops(size_m, size_n, size_k) / time_cublas; + + return cuda::std::make_tuple(time_cublas, tflops_cublas, results_cublas); + } + + template + auto cublaslt_reference(double alpha, + ATensor const& tensor_a, + double beta, + CTensor& tensor_c, + matrix_half output_half, + cudaStream_t stream = 0, + int kernel_warm_up_repeats = 10, + int kernel_repeats = 100) { + // Convert arguments to cuBLAS format + // + cublasHandle_t handle; + cublasCreate(&handle); + cublasSetStream(handle, stream); + + int n = cute::get<0>(tensor_a.layout().shape()); + int k = cute::get<1>(tensor_a.layout().shape()); + + bool const is_c_row_major = cute::get<1>(tensor_c.stride()) == 1; + auto const reversed_output_half = output_half == matrix_half::lower ? matrix_half::upper : matrix_half::lower; + auto const effective_output_half = is_c_row_major ? reversed_output_half : output_half; + auto const cublas_fill_mode = + (effective_output_half == matrix_half::lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + auto const cublas_a_trans = (cute::get<0>(tensor_a.stride()) == 1) ? CUBLAS_OP_N : CUBLAS_OP_T; + auto const cublas_lda = + (cublas_a_trans == CUBLAS_OP_N) ? 
cute::get<0>(tensor_a.shape()) : cute::get<1>(tensor_a.shape()); + auto const cublas_ldc = cute::get<0>(tensor_c.layout().shape()); + + auto run_cublas = [&](cudaStream_t&) { + cublasDsyrk(handle, + cublas_fill_mode, + cublas_a_trans, + n, + k, + &alpha, + raw_pointer_cast(tensor_a.data()), + cublas_lda, + &beta, + raw_pointer_cast(tensor_c.data()), + cublas_ldc); + }; + + // Run cuBLAS for correctness + run_cublas(stream); + // Copy results + std::vector results(tensor_c.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + raw_pointer_cast(tensor_c.data()), + tensor_c.size() * sizeof(double), + cudaMemcpyDeviceToHost)); + + // Measure performance + auto time_cublas = measure::execution(run_cublas, kernel_warm_up_repeats, kernel_repeats, stream); + auto tflops_cublas = real_syrk_tflops(n, k) / time_cublas; + + // Clean up and return + cublasDestroy(handle); + return cuda::std::make_tuple(time_cublas, tflops_cublas, results); + } +} // namespace tutorial diff --git a/tutorials/floating-point-emulation/cpp_source/include/tensor_helpers.hpp b/tutorials/floating-point-emulation/cpp_source/include/tensor_helpers.hpp new file mode 100644 index 00000000..6cdbd7ba --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/tensor_helpers.hpp @@ -0,0 +1,155 @@ +#pragma once + +// For CuTe Tensor types +#include + +// For required cuda::std types +#include +#include +#include + +namespace tutorial { + + template + struct tensor_value_type; + + template + struct tensor_value_type> { + using type = typename Engine::value_type; + }; + + template + using tensor_value_type_t = typename tensor_value_type::type; + + + namespace detail { + + template + CUBLASDX_HOST_DEVICE auto convert_to_cute_tuple_element(Element const& elem) { + static_assert(cuda::std::is_integral_v, "Only flat integral tuples are supported"); + return elem; + } + + template + CUBLASDX_HOST_DEVICE auto convert_to_cute_tuple_element(cuda::std::integral_constant) { + 
static_assert(cuda::std::is_integral_v, "Only flat integral tuples are supported"); + return cute::Int {}; + } + + template + CUBLASDX_HOST_DEVICE auto convert_to_cute_tuple_element(cute::C) { + static_assert(cuda::std::is_integral_v, "Only flat integral tuples are supported"); + return cute::Int {}; + } + + template + CUBLASDX_HOST_DEVICE auto convert_to_cute_tuple(cuda::std::tuple const& std_tuple, + cuda::std::integer_sequence) { + return cute::make_tuple(convert_to_cute_tuple_element(cuda::std::get(std_tuple))...); + } + + template + CUBLASDX_HOST_DEVICE auto convert_to_cute_tuple(cuda::std::tuple const& std_tuple) { + constexpr unsigned num_elems = sizeof...(TupleArgs); + return convert_to_cute_tuple(std_tuple, cuda::std::make_integer_sequence()); + } + } // namespace detail + + template + CUBLASDX_HOST_DEVICE auto make_gmem_tensor_from_tuples(PointerType* pointer_type, + cuda::std::tuple const& shape, + cuda::std::tuple const& stride) { + + auto cute_shape = detail::convert_to_cute_tuple(shape); + auto cute_stride = detail::convert_to_cute_tuple(stride); + auto cute_layout = cute::make_layout(cute_shape, cute_stride); + + return cute::make_tensor(cute::make_gmem_ptr(pointer_type), cute_layout); + } + + template + CUBLASDX_HOST_DEVICE auto make_layout_from_tuples(cuda::std::tuple const& shape, + cuda::std::tuple const& stride) { + + auto cute_shape = detail::convert_to_cute_tuple(shape); + auto cute_stride = detail::convert_to_cute_tuple(stride); + return cute::make_layout(cute_shape, cute_stride); + } + + template + struct is_integral: cuda::std::is_integral {}; + + template + struct is_integral>: cuda::std::true_type {}; + + template + inline constexpr bool is_integral_v = is_integral::value; + + template + constexpr auto make_order(cuda::std::integer_sequence) { + auto col_major_order = cute::Step...> {}; + auto row_major_order = cute::Step...> {}; + + return cute::conditional_return(col_major_order, row_major_order); + } + + template + auto 
get_empty_device_tensor(Dimensions... dimensions) { + + static_assert((is_integral_v && ...)); + auto const total_size = (dimensions * ...); + thrust::device_vector device_vector(total_size); + + auto iter = cute::make_gmem_ptr(thrust::raw_pointer_cast(device_vector.data())); + auto const shape = cute::make_shape(detail::convert_to_cute_tuple_element(dimensions)...); + auto const stride_atom = + make_order(cuda::std::make_integer_sequence()); + auto const layout = cute::make_ordered_layout(shape, stride_atom); + auto tensor = cute::make_tensor(iter, layout); + + return cuda::std::make_tuple(std::move(device_vector), tensor); + } + + template + CUBLASDX_HOST_DEVICE auto make_gmem_tensor_from_tuples(PointerType* pointer_type, + cuda::std::tuple const& shape) { + + auto cute_shape = detail::convert_to_cute_tuple(shape); + + auto const stride_atom = + make_order(cuda::std::make_integer_sequence()); + auto const layout = cute::make_ordered_layout(cute_shape, stride_atom); + + return cute::make_tensor(cute::make_gmem_ptr(pointer_type), layout); + } + + template + __host__ __device__ __forceinline__ auto make_smem_tensor(Iterator* iterator) { + auto iter = cute::make_smem_ptr(iterator); + auto shape = cute::Shape, cute::Int> {}; + auto stride_atom = cute::conditional_return( + cute::LayoutLeft {}, cute::LayoutRight {}); + + return cute::make_tensor(iter, cute::make_layout(shape, stride_atom)); + } + + template + auto get_copy_tensor(Tensor old_tensor) { + using tensor_value_type = tensor_value_type_t; + auto tensor_elems = size(old_tensor.layout()); + thrust::device_vector device_vector(tensor_elems); + CUDA_CHECK_AND_EXIT(cudaMemcpy(thrust::raw_pointer_cast(device_vector.data()), + raw_pointer_cast(old_tensor.data()), + tensor_elems * sizeof(tensor_value_type), + cudaMemcpyDeviceToDevice)); + + auto iter = cute::make_gmem_ptr(thrust::raw_pointer_cast(device_vector.data())); + auto tensor = cute::make_tensor(iter, old_tensor.layout()); + + return 
cuda::std::make_tuple(std::move(device_vector), tensor); + } + + using cute::conditional_return; + using cute::raw_pointer_cast; + using cute::size; +} // namespace tutorial diff --git a/tutorials/floating-point-emulation/cpp_source/include/tutorial_helpers.hpp b/tutorials/floating-point-emulation/cpp_source/include/tutorial_helpers.hpp new file mode 100644 index 00000000..00da55cb --- /dev/null +++ b/tutorials/floating-point-emulation/cpp_source/include/tutorial_helpers.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +// Utilities for Random Number Generation +#include "random.hpp" +// Utilities for numerical conversions +#include "numerical.hpp" +// Utilities for stable performance measurement +#include "performance_measurement.hpp" +// Helpers types and utilities for matmul performance +#include "performance_measurement.hpp" +// Helpers for CUDA Runtime +#include "cuda_utilities.hpp" +// Intermediate layer for managing layouts and tensors +#include "tensor_helpers.hpp" +// cuBLASLt reference with performance autotuning +#include "reference/reference.hpp" + +namespace tutorial { + + struct gemm_problem_t { + int m; + int n; + int k; + double alpha; + double beta; + }; + + struct syrk_problem_t { + int n; + int k; + double alpha; + double beta; + matrix_half uplo; + }; + + void print_device_properties() { + cudaDeviceProp prop; + int sm_clock, mem_clock; + + int device_count = 0; + CUDA_CHECK_AND_EXIT(cudaGetDeviceCount(&device_count)); + + std::stringstream ss; + ss << "Number of CUDA devices: " << device_count << std::endl << std::endl; + + for (auto device_id = 0; device_id < device_count; device_id++) { + CUDA_CHECK_AND_EXIT(cudaGetDeviceProperties(&prop, device_id)); + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&sm_clock, cudaDevAttrClockRate, device_id)); + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&mem_clock, cudaDevAttrMemoryClockRate, device_id)); + + ss << "Device " << device_id << ": " << prop.name 
<< std::endl; + ss << " Compute capability: " << prop.major << "." << prop.minor << std::endl; + ss << " Total global memory: " << (prop.totalGlobalMem >> 20) << " MB" << std::endl; + ss << " Multiprocessors: " << prop.multiProcessorCount << std::endl; + ss << " Max threads per block: " << prop.maxThreadsPerBlock << std::endl; + ss << " Max threads per multiprocessor: " << prop.maxThreadsPerMultiProcessor << std::endl; + ss << " Warp size: " << prop.warpSize << std::endl; + + ss << " Clock Rate: " << sm_clock / 1000.f << " MHz" << std::endl; + ss << " Memory Clock Rate: " << mem_clock / 1000.f << " MHz" << std::endl; + + ss << " Memory Bus Width: " << prop.memoryBusWidth << " bits" << std::endl; + ss << std::endl; + } + + std::cout << ss.str(); + } + +} // namespace tutorial diff --git a/tutorials/floating-point-emulation/notebooks/01-Introduction/01-Introduction.ipynb b/tutorials/floating-point-emulation/notebooks/01-Introduction/01-Introduction.ipynb new file mode 100644 index 00000000..5a5c3629 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/01-Introduction/01-Introduction.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "35320b28-7dc6-48dc-a502-96a11eaa9583", + "metadata": {}, + "outputs": [], + "source": [ + "# SPDX-License-Identifier: Apache-2.0 AND CC-BY-NC-4.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "id": "0c01caf9-afbb-4769-a2b6-0979ed0c2b80", + "metadata": { + "tags": [] + }, + "source": [ + "## Introduction\n", + "\n", + "### Prerequsities\n", + "\n", + "To get the most out of this lab you should already be able to:\n", + " - Declare variables, write loops, and use if / else statements in C++\n", + " - Use pointer and iterators to access arrays of data\n", + " - Use lambda expressions (unnamed functions)\n", + " - Write basic CUDA kernels and understand core conecepts like threads, blocks, and grids\n", + " - If you are new to CUDA, we recommend the [CUDA C++ Tutorial](https://github.com/NVIDIA/accelerated-computing-hub/tree/main/tutorials/cuda-cpp)\n", + "\n", + "\n", + "On the infrastructure side, this lab assumes that you:\n", + " - Have the latest (13.1+) [CUDA toolkit](https://developer.nvidia.com/cuda-downloads) installed\n", + " - Have latest MathDx package (25.12.1) downloaded into your system (preferably installed to `/opt/nvidia/mathdx/25.12`)\n", + " - Have an NVIDIA GPU installed in your system\n", + " - Are running on a Linux system\n", + "\n", + "If you are using Windows, try WSL (Windows Subsystem for Linux) or file an issue for Windows support\n", + "\n", + "___\n", + "\n", + "### Objectives\n", + "\n", + "By the time you complete this lab, you will be able to:\n", + "\n", + " - Understand key performance characteristics and parameters in GEMM kernels\n", + " - Be able to write performant and portable CUDA kernels that leverage Tensor Cores and TMA\n", + " - Understand kernel fusion and directly see how it can improve performance\n", + " - Understand how to emulate FP64 matrix multiplication with the Ozaki-I Scheme \n", + " - Understand how to simplify kernel development and achieve higher performance with a tile-based programming model'\n", + "\n", + "___\n", + "\n", + "### Content\n", + "\n", + " - Matrix Multiplication Fundamentals\n", + " - **Exercise 2.1:** A simple DGEMM Kernel\n", + " - 
**Exercise 2.2:** Improving DGEMM Performance with Shared Memory tiling\n", + " - **Exercise 2.3:** Improving DGEMM Performance with pipelining\n", + " - Matrix Multiplication with cuBLASDx\n", + " - **Challenge Exercise 2.4:** Implementing Pipelining with cublasDx \n", + " - Ozaki-I Emulation\n", + " - **Exercise 3.1**: IGEMM Based Ozaki-I Scheme\n", + " - **Exercise 3.2**: Optimizing Ozaki-I With Fusion\n", + " - **Exercise 3.3**: A Fully Fused Ozaki-I Implementation\n", + " - Challenge Exercise:\n", + " - **Challenge Exercise 4.1**: DSYRK using Ozaki-I\n", + "\n", + "___\n", + "\n", + "### Why Device Extension Libraries?\n", + "\n", + "In short, programming with Tensor Cores is difficult. Each Tensor Core generation changes calling conventions, requires different synchronization patterns, requires different memory layouts, and even reads and writes to different memory subsystems. Device extension libraries provide an interface that is stable across GPU generations and provides performant access to Tensor Cores. In this lab, we will learn how to leverage device extension libraries, specifically cuBLASDx, to implement the Ozaki-I scheme. The kernels we write will work on Volta and newer GPU's without code modification and can reach Speed of Light (SOL) performance on all of them.\n", + "\n", + "The cuBLASDx library provides a tile-based programming model to which makes writing performant Tensor Core kernels as easy as it has ever been. A tile-based programming model allows users to define the compuation at a higher level and let library developers handle low level details like thread mapping, memory hierarchies, synchronization, and TMA.\n", + "\n", + "In this course, you will learn how to leverage cuBLASDx to write performant kernels for complex algorithms by using FP64 Emulation with the Ozaki-I Scheme as a testbed to understand cuBLASDx." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da50f139-2b98-45b7-906d-045f562d49be", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.01-MatmulFundamentals.ipynb b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.01-MatmulFundamentals.ipynb new file mode 100644 index 00000000..35c98c22 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.01-MatmulFundamentals.ipynb @@ -0,0 +1,1550 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "45494e75-0bb2-4116-a92e-72bd2d5466f6", + "metadata": {}, + "source": [ + "# Matrix Multiplication Fundamentals\n", + "\n", + "A GEMM (General Matrix Multiply) operation takes the form $C = \\alpha \\mathbf{A}\\mathbf{B} + \\beta\\mathbf{C}$ where $\\alpha, \\beta$ are scalars, $\\mathbf{A}$ is an $m \\times k$ matrix, $\\mathbf{B}$ is a $k \\times n$ matrix, and $\\mathbf{C}$ is a $m \\times n$ matrix.\n", + "\n", + "The element at row $i$ and column $j$ of matrix $\\mathbf{C}$ is calculated as the scaled and biased dot product of row $i$ of $\\mathbf{A}$ and column $j$ of $\\mathbf{B}$ as follows:\n", + "\n", + "$$\n", + "\\mathbf{C}_{i, j} = \\alpha \\left(\\sum_{l=0}^{k} \\mathbf{A}_{i, l} \\mathbf{B}_{l, j} \\right) + \\beta \\mathbf{C}_{i, j}\n", + "$$\n", + "\n", + "In implementation the above operation is usually split into 2 parts:\n", + "1. 
**Matrix Multiplication** itself, computing $ \\mathbf{D}_{i, j} = \\sum_{l=0}^{k} \\mathbf{A}_{i, l} \\mathbf{B}_{l, j} $\n", + "2. **Epilogue**, computing $ \\mathbf{C}_{i, j} = \\alpha \\cdot \\mathbf{D}_{i, j} + \\beta \\cdot \\mathbf{C}_{i, j} $" + ] + }, + { + "cell_type": "markdown", + "id": "a4cd8594-9e87-406a-92c9-2386aaef7168", + "metadata": {}, + "source": [ + "## Exercise Setup" + ] + }, + { + "cell_type": "markdown", + "id": "df324ba7-6ef8-4939-86e5-c94da28db3e5", + "metadata": {}, + "source": [ + "### C++ - CMake configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a6e8fd3-94e1-421b-9401-fd6cdf059863", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import sys, os\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "from common_cuda import setup_cmake_project\n", + "\n", + "# A python cmake wrapper to determine the GPU architecture and compile for only that\n", + "setup_cmake_project()" + ] + }, + { + "cell_type": "markdown", + "id": "aac3e422-bcf5-416c-aa6b-824e830a0acb", + "metadata": {}, + "source": [ + "### Python - Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc473f40-e19a-4956-b7d4-3cd0f6c6a9ad", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc, MAX_ALIGNMENT\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import *" + ] + }, + { + "cell_type": "markdown", + "id": "2a5cf955-0389-427c-8f82-43680f5be3f9", + "metadata": {}, + "source": [ + "## 
Exercise 2.1: Naive DGEMM Kernel\n", + "\n", + "In this exercise, we will implement a naive GEMM algorithm by having each CUDA thread calculate one element in our C matrix:\n", + "\n", + "\n", + "\n", + "This diagram shows that we compute element (0, 0) of the C matrix by calculating the dot product between row 0 of the A matrix and column 0 of the B matrix. This can be done by iterating along the K dimension and at each step multiplying i'th element of the row of A and column of B and accumulating the results." + ] + }, + { + "cell_type": "markdown", + "id": "f035895b-38b4-4fe9-86a7-006620ac1310", + "metadata": { + "tags": [] + }, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "294f3a97-41b2-4f09-9091-21d39e6a9a5c", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1a/parameters.hpp.inc\n", + "\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1401fb9b-bcf5-4921-a4ab-f615912bdc32", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1a/kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BlockSize, 1) __global__ void kernel_1a_simple_dgemm(double alpha,\n", + " TensorA const tensor_a,\n", + " TensorB const tensor_b,\n", + " double beta,\n", + " TensorC tensor_c) {\n", + " int const thread_row_idx = threadIdx.x + blockIdx.x * blockDim.x;\n", + " int const thread_col_idx = threadIdx.y + blockIdx.y * blockDim.y;\n", + "\n", + " auto [size_m, size_n] = tensor_c.shape();\n", + " auto size_k = tutorial::size<1>(tensor_a);\n", + "\n", + " // EXERCISE --> What are the conditions for early exit?\n", + " if (...) 
{\n", + " return;\n", + " }\n", + "\n", + " double accumulator = 0.0;\n", + "\n", + " // EXERCISE --> Complete the following implementation to compute the dot product between row 'thread_row_idx' of matrix A \n", + " // and the column of 'thread_col_idx' of matrix B.\n", + " for (...) {\n", + " accumulator += ...;\n", + " }\n", + "\n", + " // We can use the tensor object to do 2D indexing as follows:\n", + " double c_elem = tensor_c(thread_row_idx, thread_col_idx);\n", + "\n", + " // HINT: Remember that we are doing a GEMM operation (see above)\n", + " tensor_c(thread_row_idx, thread_col_idx) = ...;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e39a2947-b7d9-4351-bbac-7a911dee35fa", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1a/kernel_parameters.hpp.inc\n", + " // If you have time, try a few different block dimensions and see how the performance changes.\n", + "\n", + " // Setup kernel configuration\n", + " int const block_dim_x = 16;\n", + " int const block_dim_y = 16;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19b78dc4-0691-4aa3-b357-cb97beed15f4", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build build/ -t 1a_simple_dgemm_tensor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ae6502a-810d-4c9d-8fff-85339aab5e99", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/1a_simple_dgemm_tensor" + ] + }, + { + "cell_type": "markdown", + "id": "ad532e04-e70a-40cc-8cc7-19589db4b9bc", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "3dd350ee-f00d-40f4-8731-cbc2dc73a672", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. If you want to restart your exercise make sure you rewrite kernel back and recompile it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bb10bd7-4e7a-40b9-b3ee-c711a1252a49", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1a/kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BlockSize, 1) __global__ void kernel_1a_simple_dgemm(double alpha,\n", + " TensorA const tensor_a,\n", + " TensorB const tensor_b,\n", + " double beta,\n", + " TensorC tensor_c) {\n", + " int const thread_row_idx = threadIdx.x + blockIdx.x * blockDim.x;\n", + " int const thread_col_idx = threadIdx.y + blockIdx.y * blockDim.y;\n", + "\n", + " auto [size_m, size_n] = tensor_c.shape();\n", + " auto size_k = tutorial::size<1>(tensor_a);\n", + "\n", + " if (thread_row_idx >= size_m || thread_col_idx >= size_n) {\n", + " return;\n", + " }\n", + "\n", + " double accumulator = 0.0;\n", + "\n", + " // EXERCISE --> Complete the following implementation to compute the dot product between row 'thread_row_idx' of matrix A \n", + " // and the column of 'thread_col_idx' of matrix B.\n", + " for (int i = 0; i < size_k; i++) {\n", + " accumulator += tensor_a(thread_row_idx, i) * tensor_b(i, thread_col_idx);\n", + " }\n", + "\n", + " // We can use the tensor object to do 2D indexing as follows:\n", + " double c_elem = tensor_c(thread_row_idx, thread_col_idx);\n", + "\n", + " // HINT: Remember that we are doing a GEMM operation (see above)\n", + " tensor_c(thread_row_idx, thread_col_idx) = alpha * accumulator + beta * c_elem;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b61004b-fe28-4b94-9b3f-cb39ed987351", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build build/ -t 1a_simple_dgemm_tensor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64d9d43b-8644-42a4-8555-d599223369f0", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/1a_simple_dgemm_tensor" + ] + }, + { + "cell_type": "markdown", + "id": "cfaf685f-c367-4655-a9c3-ef982c9cb112", + "metadata": {}, + 
"source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4c1522-c87b-4421-9ad3-a01b49f61ec3", + "metadata": {}, + "outputs": [], + "source": [ + "# The problems that we will benchmark and conduct accuracy tests on the tuple should be formed as:\n", + "# (GEMM_M, GEMM_N, GEMM_K, ALPHA, BETA)\n", + "problems = [\n", + " (2048, 2048, 2048, 0.9, 1.1),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d9181c-8fed-4c01-9727-4859cdf1a3df", + "metadata": {}, + "outputs": [], + "source": [ + "def get_2_1_dgemm_kernel(block_x = 16, block_y = 16):\n", + " block_size = block_x * block_y\n", + " \n", + " @cuda.jit(launch_bounds=(block_size, 1))\n", + " def dgemm_kernel(alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " m, n = tensor_c.shape\n", + " _, k = tensor_a.shape\n", + "\n", + " thread_row_idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " thread_col_idx = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " # EXERCISE --> What are the conditions for early exit?\n", + " #if :\n", + " # return\n", + "\n", + " accumulator = 0.0\n", + "\n", + " # EXERCISE --> Complete the following implementation to compute the dot product between row 'thread_row_idx' of matrix A \n", + " # and the column of 'thread_col_idx' of matrix B.\n", + " #for ...:\n", + " # accumulator += ...\n", + " \n", + " # We can use the tensor object to do 2D indexing as follows:\n", + " c_elem = tensor_c[thread_row_idx, thread_col_idx]\n", + " \n", + " # HINT: Remember that we are doing a GEMM operation (see above)\n", + " #tensor_c[thread_row_idx, thread_col_idx] = ...;\n", + " \n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94253b37-b532-4c18-9832-2ccf7cdffc20", + "metadata": {}, + "outputs": [], + "source": [ + "def choose_kernel_params_2_1(m, n, k, alpha, beta):\n", + " # EXERCISE --> Try a few different block dimensions. 
A few questions to think about:\n", + " # How does performance change if they are not powers of 2?\n", + " # How does performance change with rectangular shapes? What if you change the gemm problem?\n", + " return 16, 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ff1fab-15fc-4398-8f55-10b3d3d0a3d3", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_dgemm_2_1(problems, get_2_1_dgemm_kernel, choose_kernel_params_2_1)" + ] + }, + { + "cell_type": "markdown", + "id": "ca3bd916-9f25-40fd-8855-ccd4f3f48462", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34fd791e-61dc-4e7c-ab02-720b7d346b2e", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "outputs": [], + "source": [ + "def get_2_1_dgemm_kernel_solution(block_x = 16, block_y = 16):\n", + " block_size = block_x * block_y\n", + " \n", + " @cuda.jit(launch_bounds=(block_size, 1))\n", + " def dgemm_kernel(alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " m, n = tensor_c.shape\n", + " _, k = tensor_a.shape\n", + "\n", + " thread_row_idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " thread_col_idx = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " if thread_row_idx >= m or thread_col_idx >= n:\n", + " return\n", + "\n", + " accumulator = 0.0\n", + "\n", + " # EXERCISE --> Complete the following implementation to compute the dot product between row 'thread_row_idx' of matrix A \n", + " # and the column of 'thread_col_idx' of matrix B.\n", + " for i in range(k):\n", + " accumulator += tensor_a[thread_row_idx, i] * tensor_b[i, thread_col_idx]\n", + " \n", + " # We can use the tensor object to do 2D indexing as follows:\n", + " c_elem = tensor_c[thread_row_idx, thread_col_idx]\n", + " \n", + " # HINT: Remember that we are doing a GEMM operation (see above)\n", + " tensor_c[thread_row_idx, thread_col_idx] = alpha * 
accumulator + beta * c_elem;\n", + " \n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3475a45-27de-4a23-a4a0-f67e830e1562", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_dgemm_2_1(problems, get_2_1_dgemm_kernel_solution, choose_kernel_params_2_1)" + ] + }, + { + "cell_type": "markdown", + "id": "683921e9-ff62-46e6-af52-0ee9e1ff5919", + "metadata": {}, + "source": [ + "### Analyzing Naive GEMM Performance\n", + "\n", + "Our kernel is expectedly performing just okay but how can we understand if this is an algorithmic or implementation limitation? Let's breifly analyze our implementation using a [roofline model](https://en.wikipedia.org/wiki/Roofline_model):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17d172ef-8d28-4989-854a-402f63669871", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# TFLOPS, MEMORY BANDWIDTH (GB/s)\n", + "GPU_SPECS = {\n", + " \"L40S\": (1.43, 864),\n", + " \"B200\": (37, 6200)\n", + "}\n", + "\n", + "def roofline_prediction_2_1(m, n, k):\n", + " FP64_TFLOPS, MEMORY_BANDWIDTH_GBS = GPU_SPECS[\"L40S\"]\n", + "\n", + " # By design since each thread is computing one output element\n", + " threads = m * n\n", + "\n", + " # Each dot product consists of k multiplications and k adds \n", + " flops_per_thread = 2 * k\n", + "\n", + " fp64_size = np.dtype(np.float64).itemsize\n", + "\n", + " # We load a row of matrix A, a column of matrix B, and read from / write to matrix C\n", + " memory_per_thread = (2 * k + 2) * fp64_size\n", + "\n", + " total_memory_gb = threads * memory_per_thread * 1e-9\n", + " total_tflop = threads * flops_per_thread * 1e-12\n", + "\n", + " return total_tflop / FP64_TFLOPS, total_memory_gb / MEMORY_BANDWIDTH_GBS\n", + "\n", + "time_flops, time_membw = roofline_prediction_2_1(2048, 2048, 2048)\n", + "\n", + "print(f\"The runtime from the math operations {time_flops * 1e3} ms and the runtime from 
memory is {time_membw * 1e3} ms\")\n", + "\n", + "# We will either be bottlenecked by FLOPS or Memory Bandwidth, so we take the maximum\n", + "print(f\"Therefore, the estimated best case runtime is {max(time_flops, time_membw) * 1e3} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "692deaa6-e830-4d64-9561-1ac32f03d451", + "metadata": {}, + "source": [ + "We can see that our kernel is performing roughly as we'd expect with some small improvements which are probably due to the hardware caching inputs it's already seen and GPU latency hiding. Now that we know that memory is the main bottleneck, we further optimize memory movement in the next exercise.\n", + "\n", + "Information regarding certain GPUs' capabilities can be found in official datasheets, you can see some examples here:\n", + "- [NVIDIA A100](https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-nvidia-us-2188504-web.pdf)\n", + "- [NVIDIA L40](https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/support-guide/NVIDIA-L40-Datasheet-January-2023.pdf)\n", + "- [NVIDIA L40s](https://images.nvidia.com/content/Solutions/data-center/vgpu-L40-datasheet.pdf)\n", + "- [NVIDIA Blackwell RTX Pro](https://www.nvidia.com/content/dam/en-zz/Solutions/data-center/rtx-pro-6000-blackwell-workstation-edition/workstation-blackwell-rtx-pro-6000-workstation-edition-nvidia-us-3519208-web.pdf)\n", + "- [NVIDIA B200](https://resources.nvidia.com/en-us-blackwell-architecture)" + ] + }, + { + "cell_type": "markdown", + "id": "27106acc-03df-41f6-891e-3d2cd96d32d8", + "metadata": {}, + "source": [ + "## Exercise 2.2: Improving DGEMM with Shared Memory Tiling\n", + "\n", + "In this exercise, we will discuss the memory subsystem of GPUs and leverage these hardware properties for further optimization.\n", + "\n", + "The memory subsystem for GPUs has many components. Some of which are global memory, the L2 cache, the L1 cache, and registers. 
The relative access speed for each is very dependent on the GPU, however, a common analogy can be used to describe their relative speeds:\n", + "\n", + " - Accessing registers is similar to already having the part in your hand\n", + " - Accessing L1 is like having the part in your pocket\n", + " - Accessing L2 is similar to having the part on your workbench\n", + " - Accessing global memory is like having the part in your toolbox in another room\n", + "\n", + "The higher you are in the memory hierarchy, the slower it is to access. However, at higher memory hierarchies, you typically have more space. Keeping with the analogy, since we can't keep that many parts in our hands at once, we need to strategically decide where to store our parts in order to increase our efficiency.\n", + "\n", + "In **Exercise 2.0**, what we've effectively done is read from global memory on each access, which would be like running to the other room everytime we needed a new part. In this exercise, we will strategically store data in L1 (the pockets in our analogy) and periodically fetch new data (or parts). CUDA allows us to read/write from the L1 cache through the use of shared memory. 
The exercise below will have us read from global memory in \"tiles\" and then do computations on tiles of the A/B matrices by reading from shared memory like we've shown below\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "c3c1e992-0f49-4605-9a25-23f30f58048b", + "metadata": {}, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9236fda1-193d-44a0-b25c-20c546c51d36", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1b/parameters.hpp.inc\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f25d13d-11ec-45ee-a3c9-01681edb30f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1b/kernel.hpp.inc\n", + "\n", + "template \n", + "struct tile_config {\n", + " static constexpr int m = BlockM;\n", + " static constexpr int n = BlockN;\n", + " static constexpr int k = BlockK;\n", + "\n", + " static constexpr int num_elems_a = m * k;\n", + " static constexpr int num_elems_b = k * n;\n", + "\n", + " static constexpr int max_threads_per_block = BlockM * BlockN;\n", + "\n", + " static_assert(m == n && m == k, \"This constraint is for simplicity, feel free to challenge yourself and complicate the config\");\n", + "};\n", + "\n", + "template\n", + "__launch_bounds__(TileConfig::max_threads_per_block, 1) __global__\n", + " void kernel_1b_simple_dgemm_shared(double alpha,\n", + " TensorA const tensor_a,\n", + " TensorB const tensor_b,\n", + " double beta,\n", + " TensorC const tensor_c) {\n", + " extern __shared__ __align__(sizeof(double)) unsigned char smem[];\n", + "\n", + " double* smem_a_data = reinterpret_cast(smem);\n", + " auto smem_a_tensor = tutorial::make_smem_tensor(smem_a_data);\n", + "\n", + " double* smem_b_data = tutorial::raw_pointer_cast(smem_a_tensor.data()) + TileConfig::num_elems_a;\n", + " auto smem_b_tensor = 
tutorial::make_smem_tensor(smem_b_data);\n", + "\n", + " // Assert that for A: mxk and B: kxn both Ks are the same size\n", + " auto const global_k = tutorial::size<1>(tensor_a);\n", + "\n", + " // Define accumulator storage\n", + " double accumulator = 0.0;\n", + "\n", + " int const idx_x = threadIdx.x;\n", + " int const idx_y = threadIdx.y;\n", + "\n", + " int const thread_row_idx = threadIdx.x + blockDim.x * blockIdx.x;\n", + " int const thread_col_idx = threadIdx.y + blockDim.y * blockIdx.y;\n", + "\n", + " // EXERCISE -> Iterate in tiles along the K dimension, store tiles of the A and B matrices into shored memory,\n", + " // and read from shared memory buffers when accumulating.\n", + " // \n", + " // Hints:\n", + " // - When do we need synchronize with (__syncthreads)?\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6da81e0b-ab39-42c8-9926-533be5b5ff71", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1b/kernel_parameters.hpp.inc\n", + "\n", + " constexpr int tile_m = 16;\n", + " constexpr int tile_n = 16;\n", + " constexpr int tile_k = 16;\n", + " using tile = tile_config;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a039803-ec09-4020-8612-e251d0b87b1d", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build build/ -t 1b_simple_dgemm_shared" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8923857d-72ed-48b7-9b16-cd3897881b98", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/1b_simple_dgemm_shared" + ] + }, + { + "cell_type": "markdown", + "id": "c1e15134-6812-4dbf-9f11-34610d0696d0", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "17ef2502-d0a1-4808-add9-8396b8615226", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. 
If you want to restart your exercise, make sure you rewrite the kernel back and recompile it.
dimension, store tiles of the A and B matrices into shored memory,\n", + " // and read from shared memory buffers when accumulating.\n", + " // \n", + " // Hints:\n", + " // - When do we need synchronize with (__syncthreads)?\n", + " for (int tile_iter = 0; tile_iter < (global_k / TileConfig::k); ++tile_iter) {\n", + "\n", + " // Load current tile into shared memory\n", + " auto current_global_tile_a = cublasdx::get_tile(tensor_a, smem_a_tensor.shape(), blockIdx.x, tile_iter);\n", + " auto current_global_tile_b = cublasdx::get_tile(tensor_b, smem_b_tensor.shape(), tile_iter, blockIdx.y);\n", + "\n", + " __syncthreads();\n", + "\n", + " smem_a_tensor(idx_x, idx_y) = current_global_tile_a(idx_x, idx_y);\n", + " smem_b_tensor(idx_x, idx_y) = current_global_tile_b(idx_x, idx_y);\n", + "\n", + " __syncthreads();\n", + "\n", + " #pragma unroll\n", + " for (int i = 0; i < TileConfig::k; i++) {\n", + " accumulator += smem_a_tensor(idx_x, i) * smem_b_tensor(i, idx_y);\n", + " }\n", + " }\n", + "\n", + " double const c_elem = tensor_c(thread_row_idx, thread_col_idx);\n", + " double const result = alpha * accumulator + beta * c_elem;\n", + "\n", + " // Store results\n", + " tensor_c(thread_row_idx, thread_col_idx) = result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b601a4f2-cfc7-4e04-a12c-08d3e6e1cd76", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build build/ -t 1b_simple_dgemm_shared" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e7234e3-0823-4711-9e7c-a17ce6807e4c", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/1b_simple_dgemm_shared" + ] + }, + { + "cell_type": "markdown", + "id": "2898b77d-e169-45a6-9efd-e7b839034ef0", + "metadata": {}, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "661200e7-824b-4086-932a-3539fd1de5c1", + "metadata": {}, + "outputs": [], + "source": [ + "# The problems that we will benchmark and conduct 
accuracy tests on the tuple should be formed as:\n", + "# (GEMM_M, GEMM_N, GEMM_K, ALPHA, BETA)\n", + "problems = [\n", + " (2048, 2048, 2048, 1.0, 1.0),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5193a24-1a1d-4178-a54d-afe4f63b87c4", + "metadata": {}, + "outputs": [], + "source": [ + "def get_2_2_dgemm_kernel():\n", + " # For this kernel, we simplify to only 16x16x16 tile size.\n", + " # While it's possible to change tile sizes, it significantly complicates code\n", + " TILE_M = 16\n", + " TILE_N = 16\n", + " TILE_K = 16\n", + "\n", + " BLOCK_SIZE = 16 * 16\n", + " \n", + " @cuda.jit(launch_bounds=(BLOCK_SIZE, 1))\n", + " def dgemm_kernel(alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " m, n = tensor_c.shape\n", + " _, k = tensor_a.shape\n", + "\n", + " smem_a_tensor = cuda.shared.array(shape=(TILE_M, TILE_K), dtype=np.float64)\n", + " smem_b_tensor = cuda.shared.array(shape=(TILE_K, TILE_N), dtype=np.float64)\n", + "\n", + " idx_x = cuda.threadIdx.x\n", + " idx_y = cuda.threadIdx.y\n", + " \n", + " thread_row_idx = idx_x + cuda.blockIdx.x * cuda.blockDim.x\n", + " thread_col_idx = idx_y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " # EXERCISE -> Iterate in tiles along the K dimension, store tiles of the A and B matrices into shored memory,\n", + " # and read from shared memory buffers when accumulating.\n", + " # \n", + " # Hints:\n", + " # - When do we need synchronize with (cuda.syncthreads())?\n", + " \n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e929069e-d2f6-4319-a5e1-aaaf2db5f39e", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_dgemm_2_2(problems, get_2_2_dgemm_kernel)" + ] + }, + { + "cell_type": "markdown", + "id": "5eb10441-47dd-405d-999f-50045aaeb293", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"38c440da-e5f6-4950-8964-835e38a9641e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_2_2_dgemm_kernel_solution():\n", + " # For this kernel, we simplify to only 16x16x16 tile size.\n", + " # While it's possible to change tile sizes, it significantly complicates code\n", + " TILE_M = 16\n", + " TILE_N = 16\n", + " TILE_K = 16\n", + "\n", + " BLOCK_SIZE = 16 * 16\n", + " \n", + " @cuda.jit(launch_bounds=(BLOCK_SIZE, 1))\n", + " def dgemm_kernel(alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " m, n = tensor_c.shape\n", + " _, k = tensor_a.shape\n", + "\n", + " smem_a_tensor = cuda.shared.array(shape=(TILE_M, TILE_K), dtype=np.float64)\n", + " smem_b_tensor = cuda.shared.array(shape=(TILE_K, TILE_N), dtype=np.float64)\n", + "\n", + " idx_x = cuda.threadIdx.x\n", + " idx_y = cuda.threadIdx.y\n", + " \n", + " thread_row_idx = idx_x + cuda.blockIdx.x * cuda.blockDim.x\n", + " thread_col_idx = idx_y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " accumulator = 0.0\n", + "\n", + " # EXERCISE -> Iterate in tiles along the K dimension, store tiles of the A and B matrices into shored memory,\n", + " # and read from shared memory buffers when accumulating.\n", + " # \n", + " # Hints:\n", + " # - When do we need synchronize with (cuda.syncthreads())?\n", + " for tile_k_start in range(0, k, TILE_K):\n", + " smem_a_tensor[idx_x, idx_y] = tensor_a[thread_row_idx, tile_k_start + idx_y]\n", + " smem_b_tensor[idx_x, idx_y] = tensor_b[tile_k_start + idx_x, thread_col_idx]\n", + " \n", + " cuda.syncthreads()\n", + " \n", + " for i in range(0, TILE_K):\n", + " accumulator += smem_a_tensor[idx_x, i] * smem_b_tensor[i, idx_y]\n", + "\n", + " cuda.syncthreads()\n", + "\n", + " c_elem = tensor_c[thread_row_idx, thread_col_idx]\n", + " \n", + " tensor_c[thread_row_idx, thread_col_idx] = alpha * accumulator + beta * c_elem\n", + " \n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a534c61-ec4c-4226-8409-b4a277776254", + 
"metadata": {}, + "outputs": [], + "source": [ + "benchmark_dgemm_2_2(problems, get_2_2_dgemm_kernel_solution)" + ] + }, + { + "cell_type": "markdown", + "id": "e41d7476-7b69-4a07-bbf5-b02823603a4b", + "metadata": {}, + "source": [ + "### Analyzing Tiled GEMM Performance\n", + "\n", + "Let's modify our roofline model and consider the optimizations we've made." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a83fb757-b78b-465b-90b1-7bf768e74f08", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import math\n", + "\n", + "# TFLOPS, MEMORY BANDWIDTH (GB/s)\n", + "GPU_SPECS = {\n", + " \"L40S\": (1.43, 864),\n", + " \"B200\": (37, 6200)\n", + "}\n", + "\n", + "def roofline_prediction_2_2(m, n, k, TILE_M=16, TILE_N=16, TILE_K=16):\n", + " FP64_TFLOPS, MEMORY_BANDWIDTH_GBS = GPU_SPECS[\"L40S\"]\n", + "\n", + " # Let's instead \n", + "\n", + " # By design since each thread is computing one output element\n", + " tiles = math.ceil(m / TILE_M) * math.ceil(n / TILE_N)\n", + "\n", + " # Each tile does TILE_M * TILE_N dot products which each have k multiplications and k additions\n", + " flops_per_tile = 2 * TILE_M * TILE_N * k\n", + "\n", + " fp64_size = np.dtype(np.float64).itemsize\n", + "\n", + " # We load a TILE_M rows of matrix A, TILE_N columns of matrix B, and write to and read from TILE_M * TILE_N elements of matrix C\n", + " memory_per_tile = (TILE_M * k + TILE_N * k + 2 * TILE_M * TILE_N) * fp64_size\n", + "\n", + " total_memory_gb = tiles * memory_per_tile * 1e-9\n", + " total_tflop = tiles * flops_per_tile * 1e-12\n", + "\n", + " return total_tflop / FP64_TFLOPS, total_memory_gb / MEMORY_BANDWIDTH_GBS\n", + "\n", + "time_flops, time_membw = roofline_prediction_2_2(2048, 2048, 2048)\n", + "\n", + "print(f\"The runtime from the math operations {time_flops * 1e3} ms and the runtime from memory is {time_membw * 1e3} ms\")\n", + "\n", + "# We will either be bottlenecked by FLOPS or Memory Bandwidth, so we take the 
maximum\n", + "print(f\"Therefore, the estimated best case runtime is {max(time_flops, time_membw) * 1e3} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "d92ad335-9a59-4489-b626-f60e7499f49a", + "metadata": {}, + "source": [ + "Information regarding certain GPUs' capabilities can be found in official datasheets; you can see some examples here:\n", + "- [NVIDIA A100](https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-nvidia-us-2188504-web.pdf)\n", + "- [NVIDIA H200](https://resources.nvidia.com/en-us-hopper-architecture)\n", + "- [NVIDIA L40](https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/support-guide/NVIDIA-L40-Datasheet-January-2023.pdf)\n", + "- [NVIDIA L40s](https://images.nvidia.com/content/Solutions/data-center/vgpu-L40-datasheet.pdf)\n", + "- [NVIDIA RTX PRO 6000 Blackwell Server Edition](https://www.nvidia.com/content/dam/en-zz/Solutions/data-center/rtx-pro-6000-blackwell-workstation-edition/workstation-blackwell-rtx-pro-6000-workstation-edition-nvidia-us-3519208-web.pdf)\n", + "- [NVIDIA B200](https://resources.nvidia.com/en-us-blackwell-architecture)" + ] + }, + { + "cell_type": "markdown", + "id": "6c9e400f-6079-4572-94de-0ffa35f20d58", + "metadata": {}, + "source": [ + "## Exercise 2.3: Improving DGEMM with the cuBLASDx Pipeline API\n", + "\n", + "The roofline models we used, despite being an over-simplification, provide us a lot of insight. If we are close to the roofline, that means that we are as optimized as we can be without fundamentally changing the algorithm (like when we introduced tiling). If we are far away from the roofline, then we know that looking for further optimizations could be worth it.\n", + "\n", + "Going forward, further optimizations follow a similar structure, where we would apply techniques to increase the tile size and implement similar tiling schemes at lower levels of the memory hierarchy. 
The algorithms grow more complicated and the source code length grows equally, if not more. Rather than continuing with these techniques, for the remaining exercises, we will leverage the advanced optimizations without needing to implement them by calling the cublasDx and nvmath-python libraries." + ] + }, + { + "cell_type": "markdown", + "id": "ffa895b0-efcf-47d8-b426-eab5dd4bff1c", + "metadata": {}, + "source": [ + "### What is GEMM pipelining\n", + "\n", + "MathDx operations are typically performed on the shared memory tile level just like we implemented in exercise 2.2. The typical procedure is:\n", + "\n", + "1. Load data into shared memory\n", + "2. Perform computations using shared memory\n", + "3. Store results\n", + "\n", + "In this approach kernels themselves are potentially compute-bound, but when one stage of compute is being performed, another could already be in flight into shared memory; this is what we describe as a **pipelined approach**. Without such overlapping, CUDA hardware latency hiding may not be enough to maximize memory bandwidth. This way the computational units (Tensor Cores) can be operating at maximal occupancy all the time, without ever stalling while waiting for the next stage. 
An advanced extension of pipelining is the producer-consumer model, where separate threads are responsible for loading data into buffers and others for computing results of the loaded elements.\n", + "\n", + "**cuBLASDx pipeline API** exposes this pipelined logic with a simple interface, still allowing for fusion at all levels of hierarchy, either before computation or after.\n", + "\n", + "![Pipeline](images/pipeline.png)" + ] + }, + { + "cell_type": "markdown", + "id": "d1d954fe-0d0b-4592-b023-d4563dc4f3f3", + "metadata": {}, + "source": [ + "**How to find optimal bytes-in-flight (tile size and pipeline depth)?**\n", + "\n", + "**Little’s law** in queuing theory states that the average number of items $L$ in a stable system equals the product of the arrival (or service) throughput $\lambda$ and the average time $W$ that an item spends in the system, i.e., $ L = \lambda W $\n", + "\n", + "In the context of GPU/CUDA memory systems, the “items” can be interpreted as data units, such as bytes or cache lines moving through the memory hierarchy. 
The throughput $\\lambda$ is then the sustained memory bandwidth in bytes per second, and the time $W$ corresponds to the effective memory access latency in seconds.\n", + "\n", + "Combining these interpretations yields the following approximation:\n", + "$$\n", + "\\text{bytes in flight} \\approx \\text{bandwidth} \\times \\text{latency}.\n", + "$$\n", + "\n", + "This expresses that the amount of data concurrently outstanding in the memory system must be large enough to match the product of the achievable bandwidth and the latency.\n", + "\n", + "Practically, this means that when global-memory latency is high, a CUDA kernel must generate many independent memory requests so that enough bytes are in flight to hide latency and keep DRAM bandwidth saturated.\n", + "\n", + "More information about this can be found in the [CUDA Techniques to maximize memory bandwidth and hide latency](https://www.nvidia.com/en-us/on-demand/session/gtc25-s72683/) GTC presentation." + ] + }, + { + "cell_type": "markdown", + "id": "62c2cef2-d6ca-4951-8a08-2ed38e6b3345", + "metadata": {}, + "source": [ + "### MathDx and cuBLASDx\n", + "\n", + "The [cuBLAS Device Extension](https://docs.nvidia.com/cuda/cublasdx/) library (**cuBLASDx**) gives kernel developers the flexibility to define GEMM operations in terms of shared memory tiles and compose these operations into their kernels. cuBLASDx is a part of MathDx, a **Device Extension** library suite, also containing:\n", + "- cuSolverDx, for numerical solvers\n", + "- cuFFTDx, for thread and block FFTs\n", + "- cuRANDDx, for random number generation\n", + "- nvCOMPDx, for data compression\n", + "\n", + "MathDx exposes functionality on all CUDA memory levels, ranging from global memory pipelines, through shared memory tile computations to per-thread in-register algorithms." 
+ ] + }, + { + "cell_type": "markdown", + "id": "0a9004db-c2f0-43fc-b91c-03acb2825b4f", + "metadata": {}, + "source": [ + "### GEMM kernel with cuBLASDx Pipeline API\n", + "\n", + "Some CUDA instructions require significant orchestration around them to function properly. This strongly relates to later NVIDIA Architectures (Hopper, Blackwell) and respective capabilities (`TMA`, `WGMMA`, `UTCMMA`). \n", + "\n", + "Kernel patterns used underneath to allow for greater overlap also do change, including producer-consumer warpgroups, barrier-based multi-stage pipelines and decoupled epilogues. \n", + "\n", + "With time the amount of complexity in this surrounding logic and orchestration proved to be as big as cuBLASDx's per-tile `copy` and `execute` operations. The library's core goal is to allow for fusability of external operations while still allowing for best performance math primitive execution, and thus the `cuBLASDx Pipeline API` was created, exposing the entire GEMM pipeline with a one-line call, exposing the result of GEMM in registers and allowing pre-processing operations to be fused.\n", + "\n", + "cuBLASDx documentation offers a [short guide on using Pipeline API](https://docs.nvidia.com/cuda/cublasdx/using_pipelines.html) for GEMM computations." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f5e03607-0487-4b9f-850e-57c95b34486e", + "metadata": {}, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d6f4554-eb6a-4e32-97af-425ca075979c", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1c/parameters.hpp.inc\n", + "\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d06e039-1311-41d8-8d9d-797012ba07ce", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1c/cublasdx_config.hpp.inc\n", + "\n", + "constexpr int tile_m = 64;\n", + "constexpr int tile_n = 64;\n", + "constexpr int tile_k = 32;\n", + "\n", + "constexpr int block_dim = 256;\n", + "\n", + "// The first step is to define the tile-level GEMM operation to be performed. \n", + "// This is accomplished by combining cuBLASDx operators to create a GEMM description.\n", + "\n", + "using BLAS =\n", + " decltype(cublasdx::Size() + // Description: Shared Memory GEMM Size\n", + " cublasdx::Precision() + // Description: Input Precisions\n", + " cublasdx::Type() + // Description: Input number type (real / complex)\n", + " cublasdx::Function() + // Description: BLAS function (MM - Matrix Multiplication)\n", + " cublasdx::Arrangement() + // Description: Global Memory arrangement (row- or column-major)\n", + " cublasdx::Block() + // Execution: per-tile operation level (CUDA threadblock)\n", + " cublasdx::BlockDim() + // Execution: CUDA threadblock size (1D, 2D or 3D) \n", + " cublasdx::StaticBlockDim() + // Performance: this kernel will not use more threads than specified\n", + " cublasdx::MaxAlignment() + // Performance: global and shared memory alignment is >= 16bytes\n", + " cublasdx::EnableInputStreaming() + // Performance: no per-element preprocessing needs to be used\n", + " cublasdx::SM() + // Execution: run on SM (e.g. 
89) with modifier (e.g. 89a)\n", + " cublasdx::WithPipeline()); // Execution: this per-tile descriptor will be only used with pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59ba6700-8fa4-4dbc-a631-265a8e4ba990", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1c/pipeline_config.hpp.inc\n", + "\n", + " // IMPORTANT: The pipeline description needs to be defined on host,\n", + " // because possible TMA initialization must happen through a driver call\n", + "\n", + " // Pipeline depth discussed in a section above\n", + " constexpr int pipeline_depth = 2;\n", + " // cuBLASDx will return a std::optional, depending on correctness of arguments\n", + " auto opt_device_pipeline = cublasdx::suggest_device_pipeline(tensor_a, tensor_b);\n", + "\n", + " if (not opt_device_pipeline) {\n", + " std::cout << \"Incorrect pipeline configuration, please ensure global tensors are divisible by tile\"\n", + " << std::endl;\n", + " exit(1);\n", + " }\n", + " // The pipeline can be retrieved now\n", + " auto device_pipeline = opt_device_pipeline.value();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d0c2b74-0ba3-4743-8a5a-807a99d59f69", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1c/kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void kernel_1c_simple_pipelined_dgemm(double alpha,\n", + " double beta,\n", + " TensorC const tensor_c,\n", + " // IMPORTANT --> grid constant\n", + " __grid_constant__ DevicePipeline const device_pipeline) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "\n", + " auto tile_pipeline = device_pipeline.get_tile(smem, blockIdx.x, blockIdx.y);\n", + " auto tile_gmem_c = cublasdx::get_tile(EXERCISE);\n", + "\n", + " auto epilogue_functor = [&](auto& accumulator) {\n", + " // EXERCISE --> implement GEMM epilogue (C = alpha * D + beta * C)\n", + 
" // Possible approaches:\n", + " // - manually (axpby for loop)\n", + " // - cublasdx::axpby(alpha, fragment, beta, fragment);\n", + " // - accumulator.axpby(alpha, beta, gmem_tile)\n", + "\n", + " // The following calls may or may not be necessary depending on chosen implementation\n", + " // auto register_result_tensor = accumulator.get_results();\n", + " // auto c_register_fragment = accumulator.make_partition_and_copy(tile_gmem_c);\n", + " // accumulator.partition_and_copy(TODO, tile_gmem_c);\n", + " };\n", + "\n", + " tile_pipeline.execute(epilogue_functor);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2f2a6f8-3280-4d3b-a9e4-273a9ffd3cca", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1c/kernel_config.hpp.inc\n", + "\n", + " auto kernel = kernel_1c_simple_pipelined_dgemm;\n", + " // Pipeline exposes pre-computed shared memory requirement that includes its own cache size\n", + " auto shared_memory_size = device_pipeline.buffer_size();\n", + " CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb71d9f3-470f-49ca-892d-33534f606877", + "metadata": {}, + "outputs": [], + "source": [ + "! cmake --build ./build -t 1c_simple_pipelined_dgemm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3163dc7-b3ed-4626-a63d-693a8b0a9f17", + "metadata": {}, + "outputs": [], + "source": [ + "! ./build/1c_simple_pipelined_dgemm" + ] + }, + { + "cell_type": "markdown", + "id": "a17cc39e-1647-4702-bef5-9c466b204a37", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "5a730d4b-70f1-488b-b8df-74fb5b391872", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. 
If you want to restart your exercise make sure you rewrite kernel back and recompile it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fad9dfb2-8f00-4cc3-a7f2-4add1bc64620", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/1c/kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void kernel_1c_simple_pipelined_dgemm(double alpha,\n", + " double beta,\n", + " TensorC const tensor_c,\n", + " // IMPORTANT --> grid constant\n", + " __grid_constant__ DevicePipeline const device_pipeline) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "\n", + " auto tile_pipeline = device_pipeline.get_tile(smem, blockIdx.x, blockIdx.y);\n", + " auto tile_gmem_c = cublasdx::get_tile(tensor_c, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + "\n", + " auto epilogue_functor = [&](auto& accumulator) {\n", + " accumulator.axpby(alpha, beta, tile_gmem_c);\n", + " };\n", + "\n", + " tile_pipeline.execute(epilogue_functor);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2211bfa4-d93e-4a56-9379-7b0c5b7f378f", + "metadata": {}, + "outputs": [], + "source": [ + "! cmake --build ./build -t 1c_simple_pipelined_dgemm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3606b327-3451-49ae-94ba-3032ff09a42a", + "metadata": {}, + "outputs": [], + "source": [ + "! 
./build/1c_simple_pipelined_dgemm" + ] + }, + { + "cell_type": "markdown", + "id": "2b7e349e-9bc3-4a4f-8dcc-2d31fb27a39d", + "metadata": {}, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ff0a133-c73e-4ca6-a04c-9aade04a0f01", + "metadata": {}, + "outputs": [], + "source": [ + "# The problems that we will benchmark and conduct accuracy tests on the tuple should be formed as:\n", + "# (GEMM_M, GEMM_N, GEMM_K, ALPHA, BETA)\n", + "problems = [\n", + " (2048, 2048, 2048, 1.0, 1.0),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5af23c55-d728-4a16-bdfe-75e0620a3770", + "metadata": {}, + "outputs": [], + "source": [ + "def choose_kernel_params_2_3(m, n, k, alpha, beta):\n", + " TILE_M = 64\n", + " TILE_N = 64\n", + " TILE_K = 32\n", + "\n", + " BLOCK_SIZE = 256\n", + " \n", + " # The first step is to define the tile-level GEMM operation to be performed. \n", + " # This is accomplished by combining cuBLASDx operators to create a GEMM description.\n", + "\n", + " return Matmul( \n", + " size=(TILE_M, TILE_N, TILE_K), # Description: Shared Memory GEMM Size\n", + " precision=(np.float64, np.float64, np.float64), # Description: Input Precisions\n", + " data_type=\"real\", # Description: Input number type (real / complex)\n", + " alignment=MAX_ALIGNMENT, # Performance: global and shared memory alignment\n", + " arrangement=(\"row_major\", \"col_major\", \"col_major\"), # Description: Global Memory arrangement (row- or column-major)\n", + " execution=\"Block\", # Execution: per-tile operation level (CUDA threadblock)\n", + " block_size=BLOCK_SIZE, # Execution: CUDA threadblock size (1D, 2D or 3D) \n", + " with_pipeline=True, # Execution: this per-tile descriptor will be only used with pipeline\n", + " enable_input_streaming=True, # Performance: no per-element preprocessing needs to be used\n", + " static_block_dim=True, # Performance: this kernel will not use more threads than specified\n", 
+ " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b72bcb-9265-4fc3-a54f-0bf580ac1178", + "metadata": {}, + "outputs": [], + "source": [ + "def get_kernel_args_2_3(BLAS, alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " # IMPORTANT: The pipeline description needs to be defined on host,\n", + " # because possible TMA initialization must happen through a driver call\n", + "\n", + " # Pipeline depth discussed in a section above\n", + " PIPELINE_DEPTH = 2\n", + "\n", + " TILE_K = BLAS.a_dim[1]\n", + " _, k = tensor_a.shape\n", + "\n", + " assert k >= PIPELINE_DEPTH * TILE_K, \"The user provided value for K is too small for the pipeline depth\"\n", + " \n", + " device_pipeline = BLAS.suggest_device_pipeline(PIPELINE_DEPTH, tensor_a, tensor_b)\n", + " return alpha, beta, tensor_c, device_pipeline\n", + "\n", + "def get_shared_memory_size_2_3(BLAS, kernel_args):\n", + " device_pipeline = kernel_args[-1]\n", + " # Pipeline exposes pre-computed shared memory requirement that includes its own cache size\n", + " return device_pipeline.buffer_size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe1543db-5a59-4388-887c-bb84d27cab67", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dgemm_kernel_2_3(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLAS.block_size, 1))\n", + " def dgemm_kernel(alpha, beta, tensor_c, device_pipeline: DevicePipeline):\n", + " m, n = tensor_c.shape\n", + "\n", + " ldc = max(tensor_c.strides) // tensor_c.itemsize\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=device_pipeline.buffer_alignment)\n", + "\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " 
block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + "\n", + " if block_start_m >= m or block_start_n >= n:\n", + " return\n", + " \n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + " \n", + " tile_pipeline = device_pipeline.get_tile(smem, block_m, block_n)\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " #if accumulator.is_thread_active():\n", + " # EXERCISE --> implement GEMM epilogue (C = alpha * D + beta * C)\n", + " # Possible approaches:\n", + " # - manually (axpby for loop)\n", + " # - axpby(alpha, fragment, beta, fragment)\n", + "\n", + " # The following calls may or may not be necessary depending on chosen implementation\n", + " # register_result_tensor = accumulator.get_results();\n", + " # c_register_fragment = accumulator.make_partition_and_copy(tile_gmem_c)\n", + " # accumulator.partition_and_copy(TODO, tile_gmem_c)\n", + " \n", + " tile_pipeline._del()\n", + "\n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2afaf638-ef91-40aa-984b-0910803f876f", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_dgemm_2_3(problems, get_dgemm_kernel_2_3, choose_kernel_params_2_3, get_shared_memory_size_2_3, get_kernel_args_2_3)" + ] + }, + { + "cell_type": "markdown", + "id": "ed664f54-736e-4ed7-bfcc-1ed534aa3aae", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a548188-d9e2-4c31-90ce-0344ec656075", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dgemm_kernel_2_3_solution(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + " \n", + " 
@cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLAS.block_size, 1))\n", + " def dgemm_kernel(alpha, beta, tensor_c, device_pipeline: DevicePipeline):\n", + " m, n = tensor_c.shape\n", + "\n", + " ldc = max(tensor_c.strides) // tensor_c.itemsize\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.c_value_type, alignment=device_pipeline.buffer_alignment)\n", + "\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + "\n", + " if block_start_m >= m or block_start_n >= n:\n", + " return\n", + " \n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + " \n", + " tile_pipeline = device_pipeline.get_tile(smem, block_m, block_n)\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " if accumulator.is_thread_active():\n", + " c_frag = accumulator.make_partition_and_copy(gmem_c)\n", + " axpby(alpha, accumulator.get_results(), beta, c_frag)\n", + " accumulator.partition_and_copy(c_frag, gmem_c)\n", + "\n", + " tile_pipeline._del()\n", + "\n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89ad1d70-e9fe-4fb1-aaea-667c9ef8e52b", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_dgemm_2_3(problems, get_dgemm_kernel_2_3_solution, choose_kernel_params_2_3, get_shared_memory_size_2_3, get_kernel_args_2_3)" + ] + }, + { + "cell_type": "markdown", + "id": "ad79e622-f14a-4043-bd30-974c06b2fe2d", + "metadata": {}, + "source": [ + "### Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "8eb28434-a0ab-4cb4-b448-780a516c586d", + "metadata": {}, + "source": [ + "In this notebook, we have learned:\n", + 
"\n", + "1. What a GEMM is and how to implement a naive GEMM kernel\n", + "2. How to analyze our implementations and make intelligent choices about what to optimize next\n", + "3. What shared memory is and why it is critical for performance\n", + "\n", + "and then we have progressed to offloading all these aspects onto the cuBLASDx pipeline API, understanding:\n", + "1. What is pipelining and when is it necessary\n", + "2. How to define cuBLASDx per-tile description and make a pipeline based on it\n", + "3. How to do in-kernel epilogue fusion with pipelines" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.02-MatmulWithcuBLASDx.ipynb b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.02-MatmulWithcuBLASDx.ipynb new file mode 100644 index 00000000..31b1a34e --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/02.02-MatmulWithcuBLASDx.ipynb @@ -0,0 +1,948 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "45494e75-0bb2-4116-a92e-72bd2d5466f6", + "metadata": {}, + "source": [ + "# Matrix Multiplication Fundamentals\n", + "\n", + "A GEMM (General Matrix Multiply) operation takes the form $C = \\alpha \\mathbf{A}\\mathbf{B} + \\beta\\mathbf{C}$ where $\\alpha, \\beta$ are scalars, $\\mathbf{A}$ is an $m \\times k$ matrix, $\\mathbf{B}$ is a $k \\times n$ matrix, and $\\mathbf{C}$ is a $m \\times n$ matrix.\n", + "\n", + "The element at row $i$ and column $j$ of matrix $\\mathbf{C}$ is calculated as the scaled and biased dot product of row 
$i$ of $\\mathbf{A}$ and column $j$ of $\\mathbf{B}$ as follows:\n", + "\n", + "$$\n", + "\\mathbf{C}_{i, j} = \\alpha \\left(\\sum_{l=0}^{k} \\mathbf{A}_{i, l} \\mathbf{B}_{l, j} \\right) + \\beta \\mathbf{C}_{i, j}\n", + "$$\n", + "\n", + "In implementation the above operation is usually split into 2 parts:\n", + "1. Matrix Multiplication itself, computing $ \\mathbf{D}_{i, j} = \\sum_{l=0}^{k} \\mathbf{A}_{i, l} \\mathbf{B}_{l, j} $\n", + "2. Epilogue, computing $ \\mathbf{C}_{i, j} = \\alpha \\cdot \\mathbf{D}_{i, j} + \\beta \\cdot \\mathbf{C}_{i, j} $" + ] + }, + { + "cell_type": "markdown", + "id": "6dce8553-5f92-4568-8e0c-d05bb80d293b", + "metadata": {}, + "source": [ + "### Exercise Setup" + ] + }, + { + "cell_type": "markdown", + "id": "e04244e3-5c73-4c8f-a5f2-b2dc03cf3d81", + "metadata": {}, + "source": [ + "#### C++ CMake setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66bc1b3e-d872-4de9-bdab-eb0ec22ee2ad", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "import sys, os\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "from common_cuda import setup_cmake_project\n", + "setup_cmake_project()" + ] + }, + { + "cell_type": "markdown", + "id": "23299eb4-fed2-451b-815d-f4f098e99a01", + "metadata": {}, + "source": [ + "#### Python Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ac6fc0-95d5-4c3a-9d30-000d3d4a3d92", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc, MAX_ALIGNMENT\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, 
make_tensor\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import *" + ] + }, + { + "cell_type": "markdown", + "id": "a4cd8594-9e87-406a-92c9-2386aaef7168", + "metadata": {}, + "source": [ + "## Challenge Exercise 2.4: cuBLASDx\n", + "\n", + "In this exercise, we will convert previous custom tiled GEMM logic to cuBLASDx for both BLAS computations and data movement" + ] + }, + { + "cell_type": "markdown", + "id": "410fc937-0bda-492c-a413-f00288dd9ba8", + "metadata": {}, + "source": [ + "![cublasdx](images/cublasdx.png)" + ] + }, + { + "cell_type": "markdown", + "id": "9f0b98c8-360e-409b-b151-7a97428c6715", + "metadata": {}, + "source": [ + "### cuBLASDx shared memory tile kernel" + ] + }, + { + "cell_type": "markdown", + "id": "22e6fff8-aea5-4fb0-bf93-3cf4e8cd265f", + "metadata": {}, + "source": [ + "The following kernel reimplements the one from previous notebook using `cuBLASDx` for data movement and computation. How does this help us in improving the kernel? \n", + "\n", + "While we were able to increase data reuse on shared memory tile level, our code still missed several key features of a proper GEMM kernel:\n", + "1. Use of MMA instructions, allowing for higher FLOP/s as well as better data reuse (on warp, warpgroup, block and cluster level)\n", + "2. Use of async data copies to overlap computation with background data loading, on global, shared and register level\n", + "3. Data load vectorization, operating on multiple elements at the same time\n", + "4. Shared memory use without size increase or bank conflicts\n", + "\n", + "All these are provided by `cuBLASDx` together with multiple utilities allowing to keep the code performant, portable and tiny in size. 
\n", + "\n", + "cuBLASDx documentation offers a [short guide](https://docs.nvidia.com/cuda/cublasdx/using_cublasdx.html) on using library's functionality.\n", + "\n", + "![device_gemm](images/device_gemm.svg)" + ] + }, + { + "cell_type": "markdown", + "id": "e04331a5-05b8-4e44-939a-99c47799ddfa", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### cuBLASDx C++ Guides\n", + "\n", + "It's best to use these guides as they will become necessary in exercise, instead of trying to remember all the details at once." + ] + }, + { + "cell_type": "markdown", + "id": "ccf260fe-86f4-425a-b938-2cfbe7e23f71", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### cuBLASDx guide: Data Layouts" + ] + }, + { + "cell_type": "markdown", + "id": "d7946aba-70a2-408c-b804-e12859863e2f", + "metadata": {}, + "source": [ + "The way that data is laid out in shared memory is very important for GEMM performance, it allows or limits vectorization and shared memory bank conflicts. This arrangement of elements is called `layout` in `cuBLASDx` and is a first class element computed by the library for you. It can be accessed with:\n", + "\n", + "```\n", + "auto default_layout_a = BLAS::get_layout_smem_a();\n", + "```\n", + "\n", + "More information regarding default memory layouts can be found [here](https://docs.nvidia.com/cuda/cublasdx/api/other_methods.html#get-memory-layout). \n", + "\n", + "\n", + "for regular `row-` or `column-major` layout (as described with the `cublasdx::Arrangement<...>()` operator) or:\n", + "\n", + "```\n", + "auto optimal_layout_a = BLAS::suggest_layout_smem_a();\n", + "```\n", + "\n", + "which computes a layout swizzled for maximum vectorization, removal of shared memory bank conflicts and enablement of instructions such as `ld.matrix` (`LDSM`). 
You can find out more about suggested layout [here](https://docs.nvidia.com/cuda/cublasdx/api/other_methods.html#suggested-shared-memory-layout)\n", + "\n", + "Such layouts can be used to later create tensors from them by combining with a data pointer:\n", + "\n", + "```\n", + "auto tensor_a = cublasdx::make_tensor(data_pointer, BLAS::suggest_layout_smem_a());\n", + "\n", + "// elements are accesses with parentheses operator\n", + "auto elem_0_0 = tensor_a(row_index, col_index);\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "564d2883-9def-4063-b357-077442194494", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### cuBLASDx guide: Slicing " + ] + }, + { + "cell_type": "markdown", + "id": "5cf95004-e844-4c2f-8f34-fa48113c81da", + "metadata": {}, + "source": [ + "Manually moving pointers is tedious and error prone so cuBLASDx exposes `pointer slicing API` allowing to do it automatically for you.\n", + "\n", + "Slicing can be performed to get pointers:\n", + "\n", + "```\n", + "auto [ptr_a, ptr_b, ptr_c] = cublasdx::slice_into_pointer(start_pointer, \n", + " alignment_a, layout_a,\n", + " alignment_b, layout_b,\n", + " alignment_c, num_elems_c);\n", + "```\n", + "\n", + "or to get pointers and tensors:\n", + "\n", + "```\n", + "auto [tensor_a, tensor_b, ptr_c] = cublasdx::slice(start_pointer, \n", + " alignment_a, layout_a,\n", + " alignment_b, layout_b,\n", + " alignment_c, num_elems_c);\n", + "```\n", + "\n", + "Detailed reference of slicing function and their use can be found in the [documentation](https://docs.nvidia.com/cuda/cublasdx/api/other_shared.html#shared-memory-slicing)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "71e26891-7774-4ac7-a5c9-0ef290556a21", + "metadata": {}, + "source": [ + "##### cuBLASDx guide: Shared Memory Copying " + ] + }, + { + "cell_type": "markdown", + "id": "88ee670f-bf7c-4a2a-9cac-14418f24a3d1", + "metadata": {}, + "source": [ + "Moving data between global and shared memory is an important and complicated topic. The layout of tile data in shared memory must take under account:\n", + "1. Pattern for best global memory read\n", + "2. Pattern for best shared memory store\n", + "3. Pattern for best compute load\n", + "4. Pattern for best compute store\n", + "\n", + "All these steps can be recomposed into 2 elements:\n", + "- A data memory layout (as described in `Data Layouts`)\n", + "- Heuristic for combining `source layout` and `destination layout` into an algorithm maximizing achieved bandwidth.\n", + "\n", + "The latter part is provided by `cublasdx::copy`:\n", + "\n", + "```\n", + "// Copy from global to shared using BLAS BlockDim config\n", + "using alignment = cublasdx::alignment_of;\n", + "cublasdx::copy(gmem_tensor_a, smem_tensor_a);\n", + "cublasdx::copy(gmem_tensor_b, smem_tensor_b);\n", + "cublasdx::copy_wait();\n", + "\n", + "// Copy from shared to global using 128 threads\n", + "cublasdx::copy<128, alignment::a>(smem_tensor_a, gmem_tensor_a);\n", + "cublasdx::copy_wait();\n", + "```\n", + "\n", + "The copies are async by default so they can be overlapped with other operations, sync point is forced with `cublasdx::copy_wait` which will wait on all previous copies from the entire threadblock.\n", + "\n", + "A detailed reference of all copying overloads and functions can be found in the [documentation](https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#cooperative-global-shared-copying)" + ] + }, + { + "cell_type": "markdown", + "id": "58ce7bf4-72cb-4fef-8dbc-98e750a16061", + "metadata": {}, + "source": [ + "##### cuBLASDx guide: Accumulators and results" + ] + }, + { + "cell_type": 
"markdown", + "id": "1b09bb24-fcbe-4c98-a222-ba0220b253d4", + "metadata": {}, + "source": [ + "cuBLASDx `execute(...)` exposes several APIs to be chosen from by the user:\n", + "\n", + "```\n", + "#1 Shared memory API with optional pre- and postprocessing lambdas\n", + "BLAS().execute(alpha, tensor_a, tensor_b, beta, tensor_c, \n", + " [a_load_functor, b_load_functor, c_load_functor, c_store_functor]);\n", + "\n", + "#2 Register API without accumulation with optional preprocessing lambdas\n", + "auto accumulator = BLAS().execute(tensor_a, tensor_b,\n", + " [a_load_functor, b_load_functor]);\n", + "\n", + "#3 Register API with accumulation with optional preprocessing lambdas\n", + "BLAS().execute(tensor_a, tensor_b, accumulator,\n", + " [a_load_functor, b_load_functor]);\n", + "```\n", + "\n", + "accumulator is a collection of per-thread C elements with associated execution properties. It exposes APIs such as:\n", + "\n", + "```\n", + "// Retrieve register tensor with results\n", + "auto res = accumulator.get_results();\n", + "\n", + "// Does this thread own some elements of C\n", + "bool res = accumulator.is_thread_active();\n", + "\n", + "// Are there extra zero-elements owned by some threads\n", + "bool res = accumulator.is_predicated();\n", + "\n", + "// shared_tensor_c = alpha * accumulator\n", + "accumulator.axpby(alpha, beta, shared_tensor_c);\n", + "```\n", + "\n", + "[cuBLASDx accumulator documentation](https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#accumulator-and-register-fragment-tensors) provides a detailed description of general accumulator functionality as well as [copying functions](https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#copying-registers-tensors)" + ] + }, + { + "cell_type": "markdown", + "id": "60406710-5c6b-4fe6-b707-c475242338d8", + "metadata": {}, + "source": [ + "##### Data partitioning and mapping" + ] + }, + { + "cell_type": "markdown", + "id": "83808654-7baf-4285-859a-0a4185e089d8", + "metadata": {}, + 
"source": [ + "In GEMMs we are often decomposing bigger problems into smaller subproblems and offsetting pointers manually is error prone. Rich tensor types allow doing this automatically on multiple levels. \n", + "\n", + "`cublasdx::slice` is a slice value allowing to keep entire dimension in the resulting view.\n", + "\n", + "1. Dividing global memory tensor view into tiles and choosing entire row of tiles:\n", + "```\n", + "auto global_tile_row_a = cublasdx::get_tile_row(tensor, BLAS::a_shape, tile_row_index);\n", + "\n", + "// How to access:\n", + "auto single_tile = global_tile_row_a(cublasdx::slice, cublasdx::slice, tile_col_index);\n", + "```\n", + "\n", + "2. Dividing global memory tensor view into tiles and choosing entire column of tiles:\n", + "```\n", + "auto global_tile_col_b = cublasdx::get_tile_row(tensor, BLAS::b_shape, tile_col_index);\n", + "\n", + "// How to access:\n", + "auto single_tile = global_tile_row_a(cublasdx::slice, cublasdx::slice, tile_row_index);\n", + "```\n", + "\n", + "3. Dividing global memory tensor view into tiles and choosing a single one:\n", + "```\n", + "auto global_tile_c = cublasdx::get_tile(tensor, BLAS::b_shape, tile_row_index, tile_col_index);\n", + "```\n", + "\n", + "apart from choosing tiles, it's important to map thread register result values to their appropriate locations inside the tile. 
This is allowed by `accumulator` APIs:\n", + "\n", + "```\n", + "// If this thread takes part in GEMM\n", + "if(accumulator.is_thread_active()) {\n", + " // For each element of register fragment\n", + " for(int i = 0; i < cublasdx::size(d_register_fragment); ++i) {\n", + " auto [tile_index_x, tile_index_y] = accumulator.map_fragment_index(i);\n", + " if((not accumulator.is_predicated()) or accumulator.is_index_in_bounds(i)) {\n", + " // Copy respective global element into it\n", + " d_register_fragment(i) = load_op(c_global_tensor(tile_index_x, tile_index_y));\n", + " }\n", + " }\n", + "}\n", + "```\n", + "\n", + "a per-thread view of a tile tensor can also be created:\n", + "```\n", + "auto global_c_thread_view = accumulator.partition_like_C(global_tile_c);\n", + "```\n", + "\n", + "Multiple functionalities have been combined with partinioning to allow for a terse and simple code:\n", + "\n", + "```\n", + "// Create empty fragment, partition tensor and load appropriate elements safely\n", + "auto loaded_c_register_fragment = accumulator.make_partition_and_copy();\n", + "// Partition tensor and store appropriate result elements safely\n", + "accumulator.partition_and_store(tile_global_c);\n", + "// Partition tensor and perform axpby on appropriate elements with results\n", + "accumulator.axpby(alpha, beta, tile_global_c);\n", + "// Store values from some_fragment to partitioned tensor\n", + "accumulator.partition_and_copy(some_fragment, tile_global_c);\n", + "// Load values from partitioned tensor to some_fragment\n", + "accumulator.partition_and_copy(tile_global_c, some_fragment);\n", + "```\n", + "\n", + "More examples concerning slicing and partitioning can be found in [pipeline documentation](https://docs.nvidia.com/cuda/cublasdx/using_pipelines.html#executing-pipelined-gemm) as well as [accumulator documentation](https://docs.nvidia.com/cuda/cublasdx/api/other_tensors.html#accumulator-and-register-fragment-tensors)" + ] + }, + { + "cell_type": "markdown", + 
"id": "e3c9b791-45e5-48e5-a107-a05e8a5b4a76", + "metadata": {}, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3321c458-37a1-418e-a773-80b543369823", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "%%writefile cpp/1d/parameters.hpp.inc\n", + "\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19f484be-8f47-44de-af8e-a0dd037dba9b", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "%%writefile cpp/1d/cublasdx_config.hpp.inc\n", + " // 2. Define cuBLASDx description\n", + " constexpr int tile_m = 64;\n", + " constexpr int tile_n = 64;\n", + " constexpr int tile_k = 32;\n", + "\n", + " constexpr int block_dim = 256;\n", + "\n", + " using BLAS = decltype(cublasdx::Size() + // size of shared memory tile\n", + " cublasdx::Precision() + // precision of data (e.g. 
__nv_fp8_e5m2, __half, float)\n", + " cublasdx::Type() + // choice between `real` and `complex` number type\n", + " cublasdx::Function() + //BLAS operation, `MM` stands for Matrix Multiplication\n", + " cublasdx::Arrangement() + //Expected global memory data ordering (row or column major)\n", + " cublasdx::Block() + // Execution of operation\n", + " cublasdx::BlockDim() + // block to be used, can be 1D, 2D or 3D\n", + " cublasdx::MaxAlignment() + // will force max alignment on tensor pointers in shared memory\n", + " cublasdx::SM() + // Which architecture is this code targeting\n", + " cublasdx::StaticBlockDim());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ff6d80f-ce4a-4352-a0f2-b11bd27fe3dd", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "%%writefile cpp/1d/kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BLAS::max_threads_per_block, 1) __global__\n", + " void kernel_1b_simple_dgemm_shared_cublasdx(double alpha,\n", + " TensorA const tensor_a,\n", + " TensorB const tensor_b,\n", + " double beta,\n", + " TensorC const tensor_c) {\n", + " extern __shared__ __align__(16) unsigned char smem[];\n", + "\n", + " using alignment = cublasdx::alignment_of;\n", + "\n", + " // EXERCISE --> use slicing guide to prepare shared memory tensors\n", + "\n", + " auto const global_k = tutorial::size<1>(tensor_a);\n", + "\n", + " // Define accumulator storage\n", + " // EXERCISE --> use accumulator guide to prepare the accumulator\n", + " \n", + "\n", + " // EXERCISE --> Use partitioning guide to retrieve tile row from A, tile col from B and tile from C\n", + "\n", + " // Computation loop --> dynamic, cannot unroll\n", + " auto const max_tile_iters = // EXERCISE\n", + "\n", + " for (int tile_iter = 0; tile_iter < max_tile_iters; ++tile_iter) {\n", + "\n", + " // EXERCISE --> Load current tiles into shared memory tensors, use slicing and copying guides\n", + " // EXERCISE --> use 
BLAS.execute()\n", + " // EXERCISE --> figure out where to sync around BLAS\n", + " }\n", + "\n", + " // EXERCISE --> implement epilogue using either:\n", + " // 1. single index manual for-loop\n", + " // 2. retrieving global indices and using them for global store\n", + " // 3. separate partitioning and copying\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6fe7a94-11ff-421a-80c4-c6bda2d42c9b", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "!cmake --build build/ -t 1d_simple_dgemm_cublasdx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1143a06b-6935-41af-af81-7699157f2463", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "!./build/1d_simple_dgemm_cublasdx" + ] + }, + { + "cell_type": "markdown", + "id": "727dd04f-b8eb-4c96-9c89-e171c3fc0517", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "6990e5dd-9718-441a-9ca1-3a7a1c9b4898", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. If you want to restart your exercise make sure you rewrite kernel back and recompile it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "993b01ff-4c6f-4d22-b50b-7e41dab73227", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "%%writefile cpp/1d/kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BLAS::max_threads_per_block, 1) __global__\n", + " void kernel_1c_dgemm_shared_cublasdx(double alpha,\n", + " TensorA const tensor_a,\n", + " TensorB const tensor_b,\n", + " double beta,\n", + " TensorC const tensor_c) {\n", + " extern __shared__ __align__(16) unsigned char smem[];\n", + "\n", + " using alignment = cublasdx::alignment_of;\n", + "\n", + " auto [smem_tensor_a, smem_tensor_b] =\n", + " cublasdx::shared_memory::slice(smem,\n", + " cublasdx::alignment_of_v_a,\n", + " BLAS::suggest_layout_smem_a(),\n", + " cublasdx::alignment_of_v_b,\n", + " BLAS::suggest_layout_smem_b());\n", + "\n", + " // Assert that for A: mxk and B: kxn both Ks are the same size\n", + " auto const global_k = tutorial::size<1>(tensor_a);\n", + "\n", + " // Define accumulator storage\n", + " auto accumulator = BLAS::suggest_accumulator();\n", + "\n", + " auto global_tile_row_a = cublasdx::get_tile_row(tensor_a, BLAS::a_shape, blockIdx.x);\n", + " auto global_tile_col_b = cublasdx::get_tile_col(tensor_b, BLAS::b_shape, blockIdx.y);\n", + "\n", + " auto global_tile_c = cublasdx::get_tile(tensor_c, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + " auto global_tile_out = cublasdx::get_tile(tensor_c, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + "\n", + " // Computation loop --> dynamic, cannot unroll\n", + " for (int tile_iter = 0; tile_iter < (global_k / cublasdx::size_of_v_k); ++tile_iter) {\n", + "\n", + " // Load current tile into shared memory\n", + " auto current_global_tile_a = global_tile_row_a(cublasdx::slice, cublasdx::slice, tile_iter);\n", + " auto current_global_tile_b = global_tile_col_b(cublasdx::slice, cublasdx::slice, tile_iter);\n", + "\n", + " cublasdx::copy(current_global_tile_a, 
smem_tensor_a);\n", + " cublasdx::copy(current_global_tile_b, smem_tensor_b);\n", + " cublasdx::copy_wait();\n", + "\n", + " BLAS().execute(smem_tensor_a, smem_tensor_b, accumulator);\n", + " __syncthreads();\n", + " }\n", + "\n", + " auto d_fragment = accumulator.make_partition_and_copy(global_tile_c);\n", + " cublasdx::axpby(alpha, accumulator.get_results(), beta, d_fragment);\n", + " accumulator.partition_and_copy(d_fragment, global_tile_out);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c49704bd-fcc6-4f14-b423-a06641a3ed34", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "!cmake --build build/ -t 1d_simple_dgemm_cublasdx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0385ded-d0d7-48ad-9663-8503ac5dfad6", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "!./build/1d_simple_dgemm_cublasdx" + ] + }, + { + "cell_type": "markdown", + "id": "71298be2-988c-4122-847f-1064e4f1f40d", + "metadata": {}, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae13316b-d151-4f7b-a104-eb8f8d73ed59", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "# The problems that we will benchmark and conduct accuracy tests on the tuple should be formed as:\n", + "# (GEMM_M, GEMM_N, GEMM_K, ALPHA, BETA)\n", + "problems = [\n", + " (2048, 2048, 2048, 0.9, 1.1),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b78f0da-a765-4968-a6fb-9bd41242204f", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "def choose_kernel_params_2_4(m, n, k, alpha, beta):\n", + " tile_m = 64\n", + " tile_n = 64\n", + " tile_k = 32\n", + " \n", + " block_size = 256\n", + " \n", + " return Matmul(\n", + " size=(tile_m, tile_n, tile_k),\n", + " precision=(np.float64, np.float64, 
np.float64),\n", + " data_type=\"real\",\n", + " arrangement=(\"row_major\", \"col_major\", \"col_major\"), # Do not change\n", + " execution=\"Block\",\n", + " block_size=block_size,\n", + " alignment=MAX_ALIGNMENT,\n", + " static_block_dim=True\n", + " )\n", + "\n", + "def get_shared_memory_size_2_4(BLAS):\n", + " smem_calc = SharedStorageCalc()\n", + " smem_calc.add(BLAS.alignment.a, np.dtype(BLAS.precision[0]).itemsize, BLAS.suggest_layout_smem_a())\n", + " smem_calc.add(BLAS.alignment.b, np.dtype(BLAS.precision[1]).itemsize, BLAS.suggest_layout_smem_b())\n", + " return smem_calc.get()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d7b518d-2ecf-4991-92bb-00ec34ede13c", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "def get_dgemm_kernel_2_4(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " c_size = BLAS.suggest_layout_rmem_c().cosize\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + " tile_k = BLAS.a_dim[1]\n", + " alignment_a, alignment_b, alignment_c = BLAS.alignment\n", + " \n", + " @cuda.jit(launch_bounds=(BLAS.block_size, 1))\n", + " def dgemm_kernel(alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " m, n = tensor_c.shape\n", + " _, k = tensor_a.shape\n", + "\n", + " lda = max(tensor_a.strides) // tensor_a.itemsize\n", + " ldb = max(tensor_b.strides) // tensor_b.itemsize\n", + " ldc = max(tensor_c.strides) // tensor_c.itemsize\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=16)\n", + " smem_a_buff, smem = smem[0:BLAS.a_size], smem[BLAS.a_size:]\n", + " smem_b_buff, smem = smem[0:BLAS.b_size], smem[BLAS.b_size:]\n", + "\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", 
+ "\n", + " if block_start_m >= m or block_start_n >= n:\n", + " return\n", + "\n", + " a_view = tensor_a[block_start_m : block_end_m, :]\n", + " b_view = tensor_b[:, block_start_n : block_end_n]\n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " smem_a = make_tensor(smem_a_buff, BLAS.suggest_layout_smem_a())\n", + " smem_b = make_tensor(smem_b_buff, BLAS.suggest_layout_smem_b())\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + "\n", + " accumulator = BLAS.suggest_accumulator()\n", + "\n", + " stages = k // tile_k\n", + "\n", + " for stage in range(0, stages):\n", + " stage_start_k = stage * tile_k\n", + " stage_end_k = (stage + 1) * tile_k\n", + " \n", + " stage_a = a_view[:, stage_start_k : stage_end_k]\n", + " stage_b = b_view[stage_start_k : stage_end_k, :]\n", + "\n", + " gmem_a = make_tensor(stage_a, BLAS.get_layout_gmem_a(lda))\n", + " gmem_b = make_tensor(stage_b, BLAS.get_layout_gmem_b(ldb))\n", + "\n", + " # EXERCISE --> Load current tiles into shared memory tensors, use slicing and copying guides\n", + " # Use copy and copy_wait instead of cublasdx::copy and cublasdx::copy_wait\n", + " # Alignment is directly passed to copy\n", + " # EXERCISE --> use BLAS.execute()\n", + " # EXERCISE --> figure out where to sync around BLAS\n", + "\n", + " # EXERCISE --> implement epilogue using either:\n", + " # 1. single index manual for-loop\n", + " # 2. retrieving global indices and using them for global store\n", + " # 3. 
separate partitioning and copying\n", + "\n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "474095a0-0efd-4617-a14a-037ac07330df", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "benchmark_dgemm_2_4(problems, get_dgemm_kernel_2_4, choose_kernel_params_2_4, get_shared_memory_size_2_4)" + ] + }, + { + "cell_type": "markdown", + "id": "1d29e6fb-5353-44aa-ba54-f322ddbd47c5", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6e755d6-09a6-48d9-8b1b-cfd7aaad65db", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "def get_dgemm_kernel_2_4_solution(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " c_size = BLAS.suggest_layout_rmem_c().cosize\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + " tile_k = BLAS.a_dim[1]\n", + " alignment_a, alignment_b, alignment_c = BLAS.alignment\n", + " \n", + " @cuda.jit(launch_bounds=(BLAS.block_size, 1))\n", + " def dgemm_kernel(alpha, tensor_a, tensor_b, beta, tensor_c):\n", + " m, n = tensor_c.shape\n", + " _, k = tensor_a.shape\n", + "\n", + " lda = max(tensor_a.strides) // tensor_a.itemsize\n", + " ldb = max(tensor_b.strides) // tensor_b.itemsize\n", + " ldc = max(tensor_c.strides) // tensor_c.itemsize\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=16)\n", + " smem_a_buff, smem = smem[0:BLAS.a_size], smem[BLAS.a_size:]\n", + " smem_b_buff, smem = smem[0:BLAS.b_size], smem[BLAS.b_size:]\n", + "\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + "\n", + " if 
block_start_m >= m or block_start_n >= n:\n", + " return\n", + "\n", + " a_view = tensor_a[block_start_m : block_end_m, :]\n", + " b_view = tensor_b[:, block_start_n : block_end_n]\n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " smem_a = make_tensor(smem_a_buff, BLAS.suggest_layout_smem_a())\n", + " smem_b = make_tensor(smem_b_buff, BLAS.suggest_layout_smem_b())\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + "\n", + " accumulator = BLAS.suggest_accumulator()\n", + "\n", + " stages = k // tile_k\n", + "\n", + " for stage in range(0, stages):\n", + " stage_start_k = stage * tile_k\n", + " stage_end_k = (stage + 1) * tile_k\n", + " \n", + " stage_a = a_view[:, stage_start_k : stage_end_k]\n", + " stage_b = b_view[stage_start_k : stage_end_k, :]\n", + "\n", + " gmem_a = make_tensor(stage_a, BLAS.get_layout_gmem_a(lda))\n", + " gmem_b = make_tensor(stage_b, BLAS.get_layout_gmem_b(ldb))\n", + "\n", + " copy(gmem_a, smem_a, alignment=alignment_a)\n", + " copy(gmem_b, smem_b, alignment=alignment_b)\n", + " copy_wait()\n", + "\n", + " BLAS.execute(smem_a, smem_b, accumulator)\n", + "\n", + " cuda.syncthreads()\n", + "\n", + " d_fragment = accumulator.make_partition_and_copy(gmem_c)\n", + " axpby(alpha, accumulator.get_results(), beta, d_fragment)\n", + " accumulator.partition_and_copy(d_fragment, gmem_c)\n", + "\n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "935abce0-41af-4ec0-8f31-15f403a69a6f", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "benchmark_dgemm_2_4(problems, get_dgemm_kernel_2_4_solution, choose_kernel_params_2_4, get_shared_memory_size_2_4)" + ] + }, + { + "cell_type": "markdown", + "id": "d8f3e1ee-454c-4a5b-8544-7ba72baa8981", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this notebook approaches to using cuBLASDx in device code 
were presented: the tile approach and the pipelined approach demonstrating how to efficiently use the library" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/CMakeLists.txt b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/CMakeLists.txt new file mode 100644 index 00000000..eecf6fd0 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 4.0) + +LIST(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin") +project(cublasdx-dgemm-notebook-1 VERSION 0.1 LANGUAGES CUDA CXX) + +# Add header tutorial helper files +add_library(tutorial_helpers INTERFACE) +target_include_directories(tutorial_helpers INTERFACE ../../cpp_source/include/) + +include(../../cmake/tutorial.cmake) + +add_tutorial(1a_simple_dgemm_tensor cpp/1a/1a_simple_dgemm_tensor.cu) +add_tutorial(1b_simple_dgemm_shared cpp/1b/1b_simple_dgemm_shared.cu) +add_tutorial(1c_simple_pipelined_dgemm cpp/1c/1c_simple_pipelined_dgemm.cu) +add_tutorial(1d_simple_dgemm_cublasdx cpp/1d/1d_simple_dgemm_cublasdx.cu) + diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/.gitignore b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/.gitignore new file mode 100644 index 00000000..163d4aec --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/.gitignore @@ -0,0 +1 @@ +**/*.hpp.inc diff --git 
a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1a/1a_simple_dgemm_tensor.cu b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1a/1a_simple_dgemm_tensor.cu new file mode 100644 index 00000000..4f015fae --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1a/1a_simple_dgemm_tensor.cu @@ -0,0 +1,132 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +// cuda libraries +#include + +// utility headers +#include + +using cublasdx::arrangement; +using size_tuple = cuda::std::tuple; +using arrangement_tuple = cuda::std::tuple; + +#include "kernel.hpp.inc" + +template +auto run_tutorial_kernel(Alpha alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + Beta beta, + CTensor& tensor_c, + cudaStream_t stream = 0, + int warm_up_runs = 10, + int kernel_runs = 100) { + auto const size_m = tutorial::size<0>(tensor_a.layout()); + auto const size_n = tutorial::size<1>(tensor_b.layout()); + auto const size_k = tutorial::size<1>(tensor_a.layout()); + + #include "kernel_parameters.hpp.inc" + + int const grid_dim_x = cuda::ceil_div(size_m, block_dim_x); + int const grid_dim_y = cuda::ceil_div(size_n, block_dim_y); + + auto const block_dim = dim3(block_dim_x, block_dim_y); + auto const grid_dim = dim3(grid_dim_x, grid_dim_y); + + using a_value_type = tutorial::tensor_value_type_t; + using b_value_type = tutorial::tensor_value_type_t; + using c_value_type = tutorial::tensor_value_type_t; + const int result_size = tutorial::size(tensor_c); + std::vector results(result_size); + + auto run_kernel = [&](auto& str) { + kernel_1a_simple_dgemm + <<>>(alpha, tensor_a, tensor_b, beta, tensor_c); + }; + + // correctness run + run_kernel(stream); + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + result_size * sizeof(c_value_type), + 
cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_time = tutorial::measure::execution(run_kernel, warm_up_runs, kernel_runs, stream); + auto avg_tflops = tutorial::real_gemm_tflops(size_m, size_n, size_k) / (avg_time); + return cuda::std::make_tuple(avg_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + // 0. Setup problem size and layout + constexpr arrangement arr_a = arrangement::row_major; + constexpr arrangement arr_b = arrangement::col_major; + constexpr arrangement arr_c = arrangement::col_major; + + // Parameters configuring stability of performance measurement + // Number of inital runs outside of measurement + int const warm_up_runs = 10; + // Number of runs to be measured and averaged into final result + int const kernel_runs = 100; + + #include "parameters.hpp.inc" + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << "\n"; + + // 0.5 Setup CUDA runtime + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + // 1. Generate tensors with random data + // vector is an owning thrust::device_vector, while tensor is a view + + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor(m, k); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor(k, n); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor(m, n); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + // 2. Run reference + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + // 3. 
Run custom kernel + auto [time_tutorial, tflops_tutorial, results_tutorial] = + run_tutorial_kernel(alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs); + + // 5. Print performance and correctness summary + std::cout << "\nCustom Kernel\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% reference performance \n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1b/1b_simple_dgemm_shared.cu b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1b/1b_simple_dgemm_shared.cu new file mode 100644 index 00000000..26bccf2b --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1b/1b_simple_dgemm_shared.cu @@ -0,0 +1,134 @@ +// std libraries +#include + +// CUDA std libraries +#include +#include + +// CUDA libraries +#include + +// utility headers +#include + +using cublasdx::arrangement; +using size_tuple = cuda::std::tuple; +using arrangement_tuple = cuda::std::tuple; + +#include "kernel.hpp.inc" + +template +auto run_tutorial_kernel(Alpha alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + Beta beta, + CTensor& tensor_c, + cudaStream_t stream = 0, + int warm_up_runs = 10, + int kernel_runs = 100) { + auto const size_m = 
tutorial::size<0>(tensor_a.layout()); + auto const size_n = tutorial::size<1>(tensor_b.layout()); + auto const size_k = tutorial::size<1>(tensor_a.layout()); + + #include "kernel_parameters.hpp.inc" + + // 1.5 Compute remaining configuration arguments + int const grid_dim_x = size_m / tile_m; + int const grid_dim_y = size_n / tile_n; + + auto const block_dim = dim3(tile_m, tile_n); + auto const grid_dim = dim3(grid_dim_x, grid_dim_y); + + using a_value_type = tutorial::tensor_value_type_t; + using b_value_type = tutorial::tensor_value_type_t; + using c_value_type = tutorial::tensor_value_type_t; + const int result_size = tutorial::size(tensor_c); + std::vector results(result_size); + + auto shared_memory_size = (tile_m * tile_k + tile_k * tile_n) * sizeof(double); + auto kernel = kernel_1b_simple_dgemm_shared; + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); + + auto run_kernel = [&](auto& str) { + kernel<<>>(alpha, tensor_a, tensor_b, beta, tensor_c); + }; + + // correctness run + run_kernel(stream); + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + result_size * sizeof(c_value_type), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_time = tutorial::measure::execution(run_kernel, warm_up_runs, kernel_runs, stream); + auto avg_tflops = tutorial::real_gemm_tflops(size_m, size_n, size_k) / (avg_time); + return cuda::std::make_tuple(avg_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + // 0. 
Setup problem size and layout + constexpr arrangement arr_a = arrangement::row_major; + constexpr arrangement arr_b = arrangement::col_major; + constexpr arrangement arr_c = arrangement::col_major; + + int const warm_up_runs = 10; + int const kernel_runs = 100; + + #include "parameters.hpp.inc" + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << "\n"; + + // 0.5 Setup CUDA runtime + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + // 1. Generate input data + + // 2. Generate tensors with random data + // vector is an owning thrust::device_vector, while tensor is a view + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor(m, k); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor(k, n); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor(m, n); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + // 3. Run reference + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + // 4. Run custom kernel + auto [time_tutorial, tflops_tutorial, results_tutorial] = + run_tutorial_kernel(alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs); + + // 5. 
Print performance and correctness summary + std::cout << "\nCustom Kernel\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% reference performance \n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1c/1c_simple_pipelined_dgemm.cu b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1c/1c_simple_pipelined_dgemm.cu new file mode 100644 index 00000000..32f95d95 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1c/1c_simple_pipelined_dgemm.cu @@ -0,0 +1,128 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +// cuda libraries +#include + +// utility headers +#include + +using cublasdx::arrangement; + +#include "kernel.hpp.inc" + +template +auto run_tutorial_kernel(Alpha alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + Beta beta, + CTensor& tensor_c, + cudaStream_t stream = 0, + int warm_up_runs = 10, + int kernel_runs = 100) { + auto const size_m = tutorial::size<0>(tensor_a.layout()); + auto const size_n = tutorial::size<1>(tensor_b.layout()); + auto const size_k = tutorial::size<1>(tensor_a.layout()); + + using a_value_type = tutorial::tensor_value_type_t; + using b_value_type = tutorial::tensor_value_type_t; + using 
c_value_type = tutorial::tensor_value_type_t; + const int result_size = tutorial::size(tensor_c); + std::vector results(result_size); + + #include "pipeline_config.hpp.inc" + + // description elements passed to cuBLASDx descriptor can be retrieved using unified traits: + int const grid_dim_x = size_m / cublasdx::size_of_v_m; + int const grid_dim_y = size_n / cublasdx::size_of_v_n; + + auto const grid_dim = dim3(grid_dim_x, grid_dim_y); + + // Increase max dynamic shared memory for the kernel if needed. + #include "kernel_config.hpp.inc" + + auto run_kernel = [&](auto& str) { + // Pipeline exposes precomputed block dimension to be used in kernel launch + kernel<<>>(alpha, beta, tensor_c, device_pipeline); + }; + + // correctness run + run_kernel(stream); + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + result_size * sizeof(c_value_type), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_time = tutorial::measure::execution(run_kernel, warm_up_runs, kernel_runs, stream); + auto avg_tflops = tutorial::real_gemm_tflops(size_m, size_n, size_k) / avg_time; + return cuda::std::make_tuple(avg_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + // 0. 
Setup problem size and layout + constexpr arrangement arr_a = arrangement::row_major; + constexpr arrangement arr_b = arrangement::col_major; + constexpr arrangement arr_c = arrangement::col_major; + + int const warm_up_runs = 10; + int const kernel_runs = 100; + + #include "parameters.hpp.inc" + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << "\n"; + + // 0.5 Setup CUDA runtime + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + #include "cublasdx_config.hpp.inc" + + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor(m, k); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor(k, n); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor(m, n); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + // 2. Run reference + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + auto [time_tutorial, tflops_tutorial, results_tutorial] = + run_tutorial_kernel(alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs); + + // 5. 
Print performance and correctness summary + std::cout << "\nCustom Kernel\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% reference performance \n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1d/1d_simple_dgemm_cublasdx.cu b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1d/1d_simple_dgemm_cublasdx.cu new file mode 100644 index 00000000..b2e3722b --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/cpp/1d/1d_simple_dgemm_cublasdx.cu @@ -0,0 +1,129 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +#include + +// utility headers +#include + +using cublasdx::arrangement; + +#include "kernel.hpp.inc" + +template +auto run_tutorial_kernel(Alpha alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + Beta beta, + CTensor& tensor_c, + cudaStream_t stream = 0, + int warm_up_runs = 10, + int kernel_runs = 100) { + auto const size_m = tutorial::size<0>(tensor_a.layout()); + auto const size_n = tutorial::size<1>(tensor_b.layout()); + auto const size_k = tutorial::size<1>(tensor_a.layout()); + + using result_type = tutorial::tensor_value_type_t; + + const int result_size = tutorial::size(tensor_c); + std::vector results(result_size); + 
+ // 1.5 Compute remaining configuration arguments + + int const grid_dim_x = size_m / cublasdx::size_of_v_m; + int const grid_dim_y = size_n / cublasdx::size_of_v_n; + + auto const grid_dim = dim3(grid_dim_x, grid_dim_y); + auto shared_memory_size = cublasdx::get_shared_storage_size_ab(); + auto kernel = kernel_1c_dgemm_shared_cublasdx; + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); + + auto run_kernel = [&](auto& str) { + kernel<<>>(alpha, tensor_a, tensor_b, beta, tensor_c); + }; + + // correctness run + run_kernel(stream); + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + result_size * sizeof(result_type), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_time = tutorial::measure::execution(run_kernel, warm_up_runs, kernel_runs, stream); + auto avg_tflops = tutorial::real_gemm_tflops(size_m, size_n, size_k) / avg_time; + return cuda::std::make_tuple(avg_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + // 0. 
Setup problem size and layout + constexpr arrangement arr_a = arrangement::row_major; + constexpr arrangement arr_b = arrangement::col_major; + constexpr arrangement arr_c = arrangement::col_major; + + // Parameters configuring stability of performance measurement + // Number of inital runs outside of measurement + int const warm_up_runs = 10; + // Number of runs to be measured and averaged into final result + int const kernel_runs = 100; + + #include "parameters.hpp.inc" + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << "\n"; + + // 0.5 Setup CUDA runtime + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + #include "cublasdx_config.hpp.inc" + + // 3. Generate tensors with random data + // vector is an owning thrust::device_vector, while tensor is a view + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor(m, k); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor(k, n); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor(m, n); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + // 4. Run reference + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + auto [time_tutorial, tflops_tutorial, results_tutorial] = + run_tutorial_kernel(alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs); + + // 5. 
Print performance and correctness summary + std::cout << "\nCustom Kernel\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% reference performance \n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/cublasdx.png b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/cublasdx.png new file mode 100644 index 00000000..138c5ee2 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/cublasdx.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db9e9be9cc0fea9a64012ae9df7c6a177cf9d0ed0456abebd031c5ef99f1457 +size 155053 diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/device_gemm.svg b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/device_gemm.svg new file mode 100644 index 00000000..8d563aca --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/device_gemm.svg @@ -0,0 +1,4 @@ + + + +
M
K
N
K
N
M
tile_n
tile_m
tile_m
tile_k
tile_n
tile_k
B
A
C
\ No newline at end of file diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/gemm_sizes.svg b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/gemm_sizes.svg new file mode 100644 index 00000000..245ef765 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/gemm_sizes.svg @@ -0,0 +1,4 @@ + + + +
A
B
C
N
K
K
M
M
N
\ No newline at end of file diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/naive_gemm.png b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/naive_gemm.png new file mode 100644 index 00000000..bbacf25c --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/naive_gemm.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a06f7b329cc0d8a1d5ed6a4d163003127a62e141fc817b133090c858ee70e99 +size 111896 diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/naive_gemm.svg b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/naive_gemm.svg new file mode 100644 index 00000000..b4f9cccf --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/naive_gemm.svg @@ -0,0 +1,4 @@ + + + +
N
K
K
M
M
N
(0, 0)
C
A
(0, 0)
(0, 1)
(0, 2)
(0, 3)
B
(0,0)
(1, 0)
(2, 0)
(3, 0)

Example of naive kernel mapping for M = N = K = 4

\ No newline at end of file diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipeline.png b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipeline.png new file mode 100644 index 00000000..da1e8212 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipeline.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4bee8c1f8d557af9796f531fe235f0180f912cfbb65b70bc6fc4430017806ec +size 64391 diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipeline.svg b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipeline.svg new file mode 100644 index 00000000..96fba373 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipeline.svg @@ -0,0 +1,4 @@ + + + +
Load 1
Compute 1
Load 2
Compute 2
Load 3
Compute 3
Load 4
Compute 4
Single stage GEMM
Load 1
Compute 1
Load 2
Compute 2
Load 3
Compute 3
Load 4
Compute 4
2 stage pipelined GEMM
Compute 1
Compute 2
Compute 3
Compute 4
4 stage pipelined GEMM
Load 1
Load 2
Load 3
Load 4
\ No newline at end of file diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipelined_execution.svg b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipelined_execution.svg new file mode 100644 index 00000000..6443fa7b --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/pipelined_execution.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/redundancy.svg b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/redundancy.svg new file mode 100644 index 00000000..0f28b2bd --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/redundancy.svg @@ -0,0 +1,4 @@ + + + +
N
K
K
M
M
N
C
A
B

Example of naive kernel mapping for M = N = K = 4

\ No newline at end of file diff --git a/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/tiling.png b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/tiling.png new file mode 100644 index 00000000..40d954f1 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/02-Matmul-Fundamentals/images/tiling.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f05fdd5c187d8f93cf0b664dd7c6bf3b3425bd8aea955ba7d7c3223334a85ca +size 204607 diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.01-UnfusedEmulation.ipynb b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.01-UnfusedEmulation.ipynb new file mode 100644 index 00000000..7e183770 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.01-UnfusedEmulation.ipynb @@ -0,0 +1,983 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "237c0324-0fed-45bc-883b-52b975ad1927", + "metadata": {}, + "source": [ + "# Getting Started With Emulation\n", + "\n", + "## Content\n", + "\n", + " - Introduction to the Ozaki-I Scheme\n", + " - Exercise 1: IGEMM-based Ozaki-I Scheme \n", + " - Exercise 2: Optimizing the Ozaki-I Scheme with kernel fusion\n", + " - Exercise 3: Fully fused Ozaki-I Scheme" + ] + }, + { + "cell_type": "markdown", + "id": "dd5710fd-38ec-4e28-8393-d30528d09ab4", + "metadata": {}, + "source": [ + "## The Ozaki-I Scheme\n", + "\n", + "The Ozaki Scheme was introduced as a general framework for emulating high precision data types as a combination of lower precision datatypes. This was later applied to integer matrix multiplication in the paper \"[DGEMM on Integer Matrix Multiplication Unit](https://arxiv.org/abs/2306.11975)\" by Hiroyuki Ootomo, Katsuhisa Ozaki, and Rio Yokota. When people refer to the Ozaki Scheme, it is usually implied that we are refering to the Ozaki-I scheme with integers. 
For the remainder of the tutorial, we will mean the same.\n", + "\n", + "This algorithm consists of three parts:\n", + "\n", + "1. Slicing - A method to transform inputs into lower precision datatypes, which are referred to as slices\n", + "2. Slice Multiplication - Multiplying the corresponding A slices by the corresponding B slices\n", + "3. Error-free transformation - This takes the output of multiplication and transforms it into the higher precision datatype.\n", + "\n", + "For this tutorial, we will give high level background on (1) and focus on steps (2) and (3). If you are interested in learning more about (1), the code is present and documented within this tutorial. We would also encourage you to review the [cuBLASDx emulation sample](https://github.com/NVIDIA/CUDALibrarySamples/tree/main/MathDx/cuBLASDx/16_dgemm_emulation).\n", + "\n", + "A diagram of the high level workflow can be found below:\n", + "\n", + "\n", + "\n", + "To perform slicing, we first need to know the maximum values for the rows of A and columns of B. These are needed to logically align the exponents for elements in the same row (for matrix A) or column (for matrix B). Once the exponents are aligned, we can read the mantissa bits into INT8 slices. In practice, this is done with logical operations rather than FP64 arithmetic for a performance advantage.\n", + "\n", + "The next phase is slice multiplication, where each slice of \"A\" can multiply with each slice of \"B\". As an optimization with minimal accuracy impact, we only multiply a subset of the slices as seen below.
It is important to note that in production libraries like cuBLAS, the ADP framework (see [Guaranteed DGEMM Accuracy While Using Reduced Precision Tensor Cores Through Extensions of the Ozaki Scheme](https://arxiv.org/abs/2511.13778)) can detect if the lower order products are necessary and will leverage more integer slices to maintain FP64 level accuracy.\n", + "\n", + "\n", + "\n", + "The last phase of this algorithm, shown at the bottom of the image above, is the error free transformation back into FP64. The steps involved are:\n", + "\n", + "1. Accumulate products along anti-diagonals\n", + "2. Convert the resulting accumulators into FP64\n", + "3. Scale the accumulators according to which anti-diagonal they represent\n", + "4. Scale the result once more to undo the exponent normalization in slicing\n", + "\n", + "## Exercise 3.1: IGEMM-based Ozaki-I Scheme\n", + "\n", + "The goal of this exercise will be to build a high level understanding of the Ozaki-I Scheme that we can then optimize in the following exercises.\n", + "\n", + "In this exercise, we will orchestrate the right slice products and build an epilogue kernel which implements the error free transformation."
+ ] + }, + { + "cell_type": "markdown", + "id": "c9888bf9-ace5-44aa-9b7c-1be840a3206e", + "metadata": {}, + "source": [ + "### C++ Cmake Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12ed88e1-c428-414f-82cf-a332c7186ff3", + "metadata": {}, + "outputs": [], + "source": [ + "import sys, os\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "from common_cuda import setup_cmake_project\n", + "setup_cmake_project()" + ] + }, + { + "cell_type": "markdown", + "id": "bbcd0aa1-5c58-4cfc-8df8-d23306829fe1", + "metadata": {}, + "source": [ + "### Python Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a020b4b9-025a-48df-a91c-260ca81a7f41", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import math\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc, MAX_ALIGNMENT\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import *\n", + "from emulation_utils import get_width, epilogue_ldexp" + ] + }, + { + "cell_type": "markdown", + "id": "fdbf54e6-de9d-47d9-8ecb-6bd7901d1224", + "metadata": {}, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e43f536-d2ed-4c9f-89ed-9ad608ed06a3", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/parameters.hpp.inc\n", + "\n", + " // ===================================\n", + " // Problem configuration\n", + " // ===================================\n", + "\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector 
problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };\n", + "\n", + " // ===================================\n", + " // Global GEMM configuration\n", + " // ===================================\n", + "\n", + " // The number of slices used in emulation algorithm\n", + " // More slices = higher precision but more computation\n", + " constexpr unsigned slices = 7;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5a00cd6-d134-4e61-9ce7-e254a44d8adb", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/cublasdx_config.hpp.inc\n", + "\n", + " using slice_value_type = int8_t; // Precision for individual slices\n", + " using accumulator_value_type = int32_t; // Precision for accumulation\n", + "\n", + " // The shape of data tile processed by a single CTA block\n", + " constexpr int tile_m = 128;\n", + " constexpr int tile_n = 128;\n", + " constexpr int tile_k = 128;\n", + "\n", + " // The shape of CTA block (number of threads)\n", + " constexpr int cta_shape_x = 128;\n", + " constexpr int cta_shape_y = 1;\n", + " constexpr int cta_shape_z = 1;\n", + "\n", + " using BLAS = decltype(cublasdx::Size() +\n", + " cublasdx::Precision() +\n", + " cublasdx::Type() + cublasdx::Function() +\n", + " cublasdx::Arrangement() + cublasdx::Block() +\n", + " cublasdx::BlockDim() + cublasdx::StaticBlockDim() +\n", + " cublasdx::WithPipeline() + cublasdx::MaxAlignment() + cublasdx::EnableInputStreaming() +\n", + " cublasdx::SM());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12b4c8ba-1847-4a8a-96d3-07d426eb0169", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/igemm_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void igemm_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " OutTensor out_tensor) {\n", + " extern __shared__ 
__align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) {\n", + " auto tile_pipeline = device_pipeline.get_tile(smem, blockIdx.x, blockIdx.y);\n", + " auto tile_gmem_out = cublasdx::get_tile(out_tensor, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + "\n", + " auto accumulator = tile_pipeline.get_accumulator();\n", + "\n", + " tile_pipeline.execute(accumulator); \n", + " if (accumulator.is_thread_active()) {\n", + " accumulator.partition_and_store(tile_gmem_out);\n", + " }\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b68b360f-bd0c-473c-bf8c-3155df5b5656", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/slice_coordination.hpp.inc\n", + "\n", + " int product_idx = 0;\n", + " int num_products = (Slices * (Slices + 1)) / 2;\n", + "\n", + " /*\n", + " * EXERCISE --> Coordinate the products between slices of matrix A and slices of matrix B\n", + " * NOTE that tensor_slice_a is shaped like (m, k, nslices) and\n", + " * tensor_slice_b is shaped like (k, n, nslices)\n", + " * with strides being (k, 1, m * k) and (1, k, k * n) respectively\n", + " * \n", + " * Compute only the most significant products shown in the diagram above and store them into the\n", + " * tensor_products which has shape (m, n, num_products)\n", + " * with shape (m, n, num_products)\n", + " */\n", + " constexpr auto initial_diag = ;\n", + " constexpr auto initial_term = ;\n", + " \n", + " for (auto diag = initial_diag; /* EXERCISE --> loop over diagonals */) {\n", + " for (auto term = initial_term; /* EXERCISE --> loop along the diagonal */) {\n", + " // EXERCISE --> Determine which slice of A and slice of B to multiply\n", + " int slice_a_index = ;\n", + " int slice_b_index = ;\n", + "\n", + " // Prepare our view of the tensors, in this case we are getting the int8 submatrix for slice 'slice_a_index'.\n", + " auto 
slice_a_view = tensor_slice_a(cublasdx::slice, cublasdx::slice, slice_a_index);\n", + " auto slice_b_view = tensor_slice_b(cublasdx::slice, cublasdx::slice, slice_b_index);\n", + " auto product_view = tensor_products(cublasdx::slice, cublasdx::slice, product_idx++);\n", + "\n", + " // Configure the device pipelines\n", + " constexpr int pipeline_depth = 3;\n", + " auto const device_pipeline =\n", + " cublasdx::suggest_device_pipeline(slice_a_view, slice_b_view).value();\n", + " auto const shared_memory_size = device_pipeline.buffer_size();\n", + " dim3 const grid_dim(shape_a_rows / static_tile_m(), shape_b_cols / static_tile_n());\n", + "\n", + " // Get the kernel and allow the kernel to use the shared memory required for the device pipeline\n", + " auto kernel = igemm_kernel;\n", + " CUDA_CHECK_AND_EXIT(\n", + " cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));\n", + "\n", + " kernel<<>>(device_pipeline,\n", + " product_view);\n", + " CUDA_CHECK_AND_EXIT(cudaGetLastError());\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e762f82-d2ea-4506-812a-a877747ae187", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/epilogue_config.hpp.inc\n", + " // Sets the block dimensions for the epilogue kernel\n", + " constexpr int epilogue_kernel_tile_m = 16;\n", + " constexpr int epilogue_kernel_tile_n = 16;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03cedf29-050d-481f-b2fb-0d53a7af15e8", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/epilogue_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BlockSize, 1) __global__ void epilogue_kernel(double alpha,\n", + " double beta,\n", + " ProductTensor product_tensor,\n", + " ShiftTensorA shift_tensor_a,\n", + " ShiftTensorB shift_tensor_b,\n", + " OutTensor out_tensor) {\n", + " using product_datatype = 
tutorial::tensor_value_type_t;\n", + " using shift_datatype = tutorial::tensor_value_type_t;\n", + " using out_datatype = tutorial::tensor_value_type_t;\n", + "\n", + " const auto tid_m = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const auto tid_n = threadIdx.y + blockIdx.y * blockDim.y;\n", + "\n", + " int shift_a = shift_tensor_a(tid_m);\n", + " int shift_b = shift_tensor_b(tid_n);\n", + "\n", + " auto product_view = product_tensor(tid_m, tid_n, cublasdx::slice);\n", + "\n", + " int product_id = 0;\n", + " double accumulator = 0.0;\n", + "\n", + " /*\n", + " * EXERCISE --> Complete the implementation of the epilogue kernel. This kernel:\n", + " * 1. Accumulates along the anti-diagonals into diag_acc\n", + " * 2. Calls nth_slice_to_fp64 to convert back to fp64 and scale the exponent\n", + " * 3. Implements a typical GEMM epilogue (alpha * accumulator + beta * C)\n", + " */\n", + "\n", + " constexpr auto initial_diag = ;\n", + " constexpr auto initial_term = ;\n", + " \n", + " for (auto diag = initial_diag; /* EXERCISE --> loop over diagonals */) {\n", + " product_datatype diag_acc = 0;\n", + " for (auto term = initial_term; /* EXERCISE --> loop along the diagonal */) {\n", + " diag_acc += product_view(product_id++);\n", + " }\n", + "\n", + " // HINT: Be careful here, the most significant diagonal is the 0th diagonal\n", + " accumulator += nth_slice_to_fp64(diag, diag_acc, shift_a + shift_b);\n", + " }\n", + "\n", + " out_tensor(tid_m, tid_n) = alpha * accumulator + beta * out_tensor(tid_m, tid_n);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8021ad97-4a44-4340-8b14-61888237dea4", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build ./build -t 2a_unfused_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59c7bca6-d9dc-4c9a-b705-2fc412c0fd69", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/2a_unfused_emulation" + ] + }, + { + "cell_type": "markdown", + "id": 
"18da3767-beae-4f4e-b18c-ae7d715227db", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "ae68e7b4-5f4f-4655-a5ba-5a91c726194f", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. If you want to restart your exercise make sure you rewrite kernel back and recompile it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8d6416b-8d92-4ea6-8211-0e679d30af3b", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/slice_coordination.hpp.inc\n", + "\n", + " int product = 0;\n", + "\n", + " /*\n", + " * EXERCISE --> Coordinate the products between slices of matrix A and slices of matrix B\n", + " * NOTE that tensor_slice_a is shaped like (m, k, nslices) and\n", + " * tensor_slice_b is shaped like (k, n, nslices)\n", + " * with strides being (k, 1, m * k) and (1, k, k * n) respectively\n", + " * \n", + " * Compute only the most significant products shown in the diagram above and store them into the\n", + " * tensor_products which has shape (m, n, num_products)\n", + " * with shape (m, n, num_products)\n", + " */\n", + " constexpr auto initial_diag = Slices - 1;\n", + " constexpr auto initial_term = 0;\n", + "\n", + " for (auto diag = initial_diag; diag >= 0; --diag) {\n", + " for (auto term = initial_term; term <= diag; ++term) {\n", + " auto slice_a_view = tensor_slice_a(cublasdx::slice, cublasdx::slice, term);\n", + " auto slice_b_view = tensor_slice_b(cublasdx::slice, cublasdx::slice, diag - term);\n", + " auto product_view = tensor_products(cublasdx::slice, cublasdx::slice, product++);\n", + "\n", + " constexpr int pipeline_depth = 3;\n", + " auto const device_pipeline =\n", + " cublasdx::suggest_device_pipeline(slice_a_view, slice_b_view).value();\n", + " auto const shared_memory_size = device_pipeline.buffer_size();\n", + " dim3 const grid_dim(shape_a_rows / static_tile_m(), 
shape_b_cols / static_tile_n());\n", + "\n", + " auto kernel = igemm_kernel;\n", + " CUDA_CHECK_AND_EXIT(\n", + " cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));\n", + "\n", + " kernel<<>>(device_pipeline,\n", + " product_view);\n", + " CUDA_CHECK_AND_EXIT(cudaGetLastError());\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04ecadf8-0e30-486d-b25f-5ea4732fda43", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2a_unfused_emulation/epilogue_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BlockSize, 1) __global__ void epilogue_kernel(double alpha,\n", + " double beta,\n", + " ProductTensor product_tensor,\n", + " ShiftTensorA shift_tensor_a,\n", + " ShiftTensorB shift_tensor_b,\n", + " OutTensor out_tensor) {\n", + " using product_datatype = tutorial::tensor_value_type_t;\n", + " using shift_datatype = tutorial::tensor_value_type_t;\n", + " using out_datatype = tutorial::tensor_value_type_t;\n", + "\n", + " const auto tid_m = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const auto tid_n = threadIdx.y + blockIdx.y * blockDim.y;\n", + "\n", + " int shift_a = shift_tensor_a(tid_m);\n", + " int shift_b = shift_tensor_b(tid_n);\n", + "\n", + " auto product_view = product_tensor(tid_m, tid_n, cublasdx::slice);\n", + "\n", + " int product_id = 0;\n", + " double result = 0.0;\n", + "\n", + " constexpr auto initial_diag = Slices - 1;\n", + " constexpr auto initial_term = 0;\n", + "\n", + " for (auto diag = initial_diag; diag >= 0; --diag) {\n", + " product_datatype diag_acc = 0;\n", + " for (auto term = initial_term; term <= diag; ++term) {\n", + " diag_acc += product_view(product_id++);\n", + " }\n", + "\n", + " result += nth_slice_to_fp64(diag, diag_acc, shift_a + shift_b);\n", + " }\n", + "\n", + " out_tensor(tid_m, tid_n) = alpha * result + beta * out_tensor(tid_m, tid_n);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "4ff19100-4bee-4008-988e-1f57d4a885ce", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build ./build -t 2a_unfused_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e2a3521-4169-483f-a931-4a3516069955", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/2a_unfused_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "ddcd0bd3-0591-4dca-9803-94f5a7aa84a7", + "metadata": {}, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8452cfba-e563-47fb-b210-f2e765b28867", + "metadata": {}, + "outputs": [], + "source": [ + "problems = [\n", + " (2048, 2048, 2048, 0.9, 1.1),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1271065b-2582-4d1a-805e-4e857f1f108f", + "metadata": {}, + "outputs": [], + "source": [ + "def get_igemm_kernel(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " A_SIZE = BLAS.suggest_layout_smem_a().cosize\n", + " B_SIZE = BLAS.suggest_layout_smem_b().cosize\n", + " C_SIZE = BLAS.suggest_layout_rmem_c().cosize\n", + "\n", + " TILE_M, TILE_N = BLAS.c_dim\n", + " TILE_K = BLAS.a_dim[1]\n", + " BLOCK_SIZE = BLAS.block_size\n", + " ALIGNMENT = min(BLAS.alignment.a, min(BLAS.alignment.b, BLAS.alignment.c))\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLOCK_SIZE, 1))\n", + " def igemm_kernel(tensor_c, device_pipeline: DevicePipeline):\n", + " m, n = tensor_c.shape\n", + "\n", + " ldc = max(tensor_c.strides) // tensor_c.itemsize\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=ALIGNMENT)\n", + "\n", + " block_start_m = block_m * TILE_M\n", + " block_end_m = (block_m + 1) * TILE_M\n", + "\n", + " block_start_n = block_n * TILE_N\n", + " block_end_n = (block_n + 1) * TILE_N\n", + "\n", + " if block_start_m 
>= m or block_start_n >= n:\n", + " return\n", + " \n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + " \n", + " tile_pipeline = device_pipeline.get_tile(smem, block_m, block_n)\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " if accumulator.is_thread_active():\n", + " accumulator.partition_and_copy(accumulator.get_results(), gmem_c)\n", + "\n", + " tile_pipeline._del()\n", + "\n", + " return igemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6313ead1-b6ec-4068-94a4-847e9f747cb6", + "metadata": {}, + "outputs": [], + "source": [ + "def unfused_igemm_ozaki(tensor_slicedA, tensor_slicedB, tensor_product, context, warmup=True):\n", + " BLAS = context[\"BLAS\"]\n", + " pipeline_depth = context[\"PIPELINE_DEPTH\"]\n", + " igemm_kernel = context[\"gemm_kernel\"]\n", + " grid = context[\"gemm_grid\"]\n", + " block = context[\"gemm_block\"]\n", + "\n", + " _, _, slices = tensor_slicedA.shape\n", + "\n", + " product_index = 0\n", + " \n", + " \"\"\"\n", + " EXERCISE --> Coordinate the products between slices of matrix A and slices of matrix B\n", + " NOTE that tensor_slice_a is shaped like (m, k, nslices) and\n", + " tensor_slice_b is shaped like (k, n, nslices)\n", + " with strides being (k, 1, m * k) and (1, k, k * n) respectively \n", + " Compute only the most significant products and store them into tensor_products\n", + " with shape (m, n, num_products)\n", + " \"\"\"\n", + "\n", + " initial_diag = -1\n", + " initial_term = -1\n", + " \n", + " for diag in range(-1): # EXERCISE --> loop over diagonals\n", + " for term in range(-1): # EXERCISE --> loop along the diagonal\n", + " # EXERCISE --> Determine which slice of A and slice of B to multiply\n", + " slice_a_index = -1\n", + " slice_b_index = -1\n", + " \n", + " # Convert from 
a cupy array to numba arrays\n", + " # - cupy arrays are needed to setup 3D strides for the pipeline API\n", + " slice_a_view = cuda.as_cuda_array(tensor_slicedA[:, :, slice_a_index])\n", + " slice_b_view = cuda.as_cuda_array(tensor_slicedB[:, :, slice_b_index])\n", + " product_view = cuda.as_cuda_array(tensor_product[:, :, product_index])\n", + "\n", + " product_index += 1\n", + "\n", + " device_pipeline = BLAS.suggest_device_pipeline(pipeline_depth, slice_a_view, slice_b_view)\n", + "\n", + " if warmup and diag == initial_diag and term == initial_term:\n", + " set_max_dynamic_shared_size_bytes(igemm_kernel, device_pipeline.buffer_size,\n", + " product_view, device_pipeline)\n", + "\n", + " igemm_kernel[grid, block, 0, device_pipeline.buffer_size](product_view, device_pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f092b50-00d8-4a50-880e-fb935d74a303", + "metadata": {}, + "outputs": [], + "source": [ + "def get_epilogue_kernel(block_size=64):\n", + " uint8_width = get_width(np.uint8)\n", + "\n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + "\n", + " @cuda.jit(launch_bounds=(block_size, 1))\n", + " def epilogue_kernel(slices, tensor_product, tensor_shift_a, tensor_shift_b, tensor_out, alpha, beta):\n", + " tid_m = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " tid_n = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " if tid_m >= tensor_out.shape[0] or tid_n >= tensor_out.shape[1]:\n", + " return\n", + "\n", + " shift_a = tensor_shift_a[tid_m]\n", + " shift_b = tensor_shift_b[tid_n]\n", + "\n", + " product_view = tensor_product[tid_m, tid_n, :]\n", + "\n", + " product_id = 0\n", + " accumulator = 0.0\n", + "\n", + " \"\"\"\n", + " EXERCISE --> Complete the implementation of the 
epilogue kernel. This kernel:\n", + " 1. Accumulates along the anti-diagonals into diag_acc\n", + " 2. Calls nth_slice_to_fp64 to convert back to fp64 and scale the exponent\n", + " 3. Implements a typical GEMM epilogue (alpha * accumulator + beta * C)\n", + " \"\"\"\n", + "\n", + " initial_diag = -1\n", + " initial_term = -1\n", + " \n", + " for diag in range(-1): # EXERCISE --> loop over diagonals\n", + " diag_acc = 0\n", + " for term in range(-1): # EXERCISE --> loop along the diagonal\n", + " diag_acc += product_view[product_id]\n", + " product_id += 1\n", + "\n", + " # HINT: Be careful here, the most significant diagonal is the 0th diagonal\n", + " accumulator += nth_slice_to_fp64(diag, diag_acc, shift_a + shift_b)\n", + " \n", + " tensor_out[tid_m, tid_n] = alpha * accumulator + beta * tensor_out[tid_m, tid_n]\n", + "\n", + " return epilogue_kernel\n", + "\n", + "def epilogue(slices, tensor_products, tensor_shift_a, tensor_shift_b, tensor_c, alpha, beta, context):\n", + " epilogue_kernel = context[\"epilogue_kernel\"]\n", + " grid = context[\"epilogue_grid\"]\n", + " block = context[\"epilogue_block\"]\n", + " \n", + " epilogue_kernel[grid, block](slices, tensor_products, tensor_shift_a, tensor_shift_b, tensor_c, alpha, beta)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ba20117-19ab-4bf2-a158-4dcf2af9f12e", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func(m, n, k):\n", + " tile_m = 128\n", + " tile_n = 128\n", + " tile_k = 128\n", + " pipeline_depth = 3\n", + " block_size = 128\n", + "\n", + " epilogue_tile_m = 16\n", + " epilogue_tile_n = 16\n", + "\n", + " assert m % tile_m == 0, \"Unsupported dimension m for TILE_M\"\n", + " assert n % tile_n == 0, \"Unsupported dimension n for TILE_N\"\n", + " assert k % tile_k == 0, \"Unsupported dimension k for TILE_K\"\n", + " assert k >= (tile_k * pipeline_depth), \"Unsupported pipeline depth for k\"\n", + "\n", + " assert m % epilogue_tile_m == 0, \"Unsupported dimension 
for EPILOGUE_TILE_M\"\n", + " assert n % epilogue_tile_n == 0, \"Unsupported dimension for EPILOGUE_TILE_N\"\n", + " \n", + " BLAS = Matmul(size=(tile_m, tile_n, tile_k),\n", + " precision=(np.int8, np.int8, np.int32),\n", + " data_type=\"real\",\n", + " alignment=MAX_ALIGNMENT,\n", + " arrangement=(\"row_major\", \"col_major\", \"col_major\"), # Do not change\n", + " execution=\"Block\",\n", + " block_size=block_size,\n", + " with_pipeline=True,\n", + " enable_input_streaming=True,\n", + " static_block_dim=True)\n", + "\n", + " gemm_grid = (m // tile_m, n // tile_n)\n", + " gemm_block = BLAS.block_dim\n", + "\n", + " epilogue_grid = (m // epilogue_tile_m, n // epilogue_tile_n)\n", + " epilogue_block = (epilogue_tile_m, epilogue_tile_n)\n", + "\n", + " return {\n", + " \"BLAS\": BLAS,\n", + " \"PIPELINE_DEPTH\": pipeline_depth,\n", + " \"gemm_kernel\" : get_igemm_kernel(BLAS),\n", + " \"gemm_grid\": gemm_grid,\n", + " \"gemm_block\": gemm_block,\n", + " \"epilogue_kernel\": get_epilogue_kernel(math.prod(epilogue_block)),\n", + " \"epilogue_grid\": epilogue_grid,\n", + " \"epilogue_block\": epilogue_block\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a556d45c-3a73-4f04-ae97-cd7d2dd4661b", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_unfused_emulated_dgemm(problems, setup_func, unfused_igemm_ozaki, epilogue)" + ] + }, + { + "cell_type": "markdown", + "id": "95cf99b3-1603-4170-8828-f37dfbb0353a", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc339107-5ef9-44bc-8dfc-ef262f59a9d7", + "metadata": {}, + "outputs": [], + "source": [ + "def unfused_igemm_ozaki_solution(tensor_slicedA, tensor_slicedB, tensor_product, context, warmup=True):\n", + " BLAS = context[\"BLAS\"]\n", + " pipeline_depth = context[\"PIPELINE_DEPTH\"]\n", + " igemm_kernel = context[\"gemm_kernel\"]\n", + " grid = 
context[\"gemm_grid\"]\n", + " block = context[\"gemm_block\"]\n", + "\n", + " _, _, slices = tensor_slicedA.shape\n", + "\n", + " product_id = 0\n", + "\n", + " \"\"\"\n", + " EXERCISE --> Coordinate the products between slices of matrix A and slices of matrix B\n", + " NOTE that tensor_slice_a is shaped like (m, k, nslices) and\n", + " tensor_slice_b is shaped like (k, n, nslices)\n", + " with strides being (k, 1, m * k) and (1, k, k * n) respectively\n", + " \n", + " Compute only the most significant products shown in the diagram above and store them into the\n", + " tensor_products which has shape (m, n, num_products)\n", + " with shape (m, n, num_products)\n", + " \"\"\"\n", + " initial_diag = slices - 1\n", + " initial_term = 0\n", + " \n", + " for diag in range(initial_diag, -1, -1):\n", + " for term in range(diag + 1):\n", + " slice_a = term\n", + " slice_b = diag - term\n", + "\n", + " # Convert from a cupy array to numba arrays\n", + " # - cupy arrays are needed to setup 3D strides for the pipeline API\n", + " slice_a_view = cuda.as_cuda_array(tensor_slicedA[:, :, slice_a])\n", + " slice_b_view = cuda.as_cuda_array(tensor_slicedB[:, :, slice_b])\n", + " product_view = cuda.as_cuda_array(tensor_product[:, :, product_id])\n", + "\n", + " device_pipeline = BLAS.suggest_device_pipeline(pipeline_depth, slice_a_view, slice_b_view)\n", + "\n", + " if warmup and diag == initial_diag and term == initial_term:\n", + " set_max_dynamic_shared_size_bytes(igemm_kernel, device_pipeline.buffer_size,\n", + " product_view, device_pipeline)\n", + "\n", + " igemm_kernel[grid, block, 0, device_pipeline.buffer_size](product_view, device_pipeline)\n", + " \n", + " product_id += 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77669dc6-000d-446b-b009-9aebb349a564", + "metadata": {}, + "outputs": [], + "source": [ + "def get_epilogue_kernel_solution(block_size=64):\n", + " uint8_width = get_width(np.uint8)\n", + "\n", + " @cuda.jit(device=True, 
forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + "\n", + " @cuda.jit(launch_bounds=(block_size, 1))\n", + " def epilogue_kernel(slices, tensor_product, tensor_shift_a, tensor_shift_b, tensor_out, alpha, beta):\n", + " tid_m = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " tid_n = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " if tid_m >= tensor_out.shape[0] or tid_n >= tensor_out.shape[1]:\n", + " return\n", + "\n", + " shift_a = tensor_shift_a[tid_m]\n", + " shift_b = tensor_shift_b[tid_n]\n", + "\n", + " product_view = tensor_product[tid_m, tid_n, :]\n", + "\n", + " product_id = 0\n", + " result = 0.0\n", + "\n", + " initial_diag = slices - 1\n", + " initial_term = 0\n", + " \n", + " for diag in range(initial_diag, -1, -1):\n", + " diag_acc = 0\n", + " for term in range(diag + 1):\n", + " diag_acc += product_view[product_id]\n", + " product_id += 1\n", + "\n", + " result += nth_slice_to_fp64(diag, diag_acc, shift_a + shift_b)\n", + "\n", + " if beta != 0:\n", + " result = alpha * result + beta * tensor_out[tid_m, tid_n]\n", + " else:\n", + " result = alpha * result\n", + "\n", + " tensor_out[tid_m, tid_n] = result\n", + "\n", + " return epilogue_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e5af2b7-c662-440d-abc9-e7a351efbba3", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func_solution(m, n, k):\n", + " ctx = setup_func(m, n, k)\n", + " epilogue_block = ctx[\"epilogue_block\"]\n", + " ctx[\"epilogue_kernel\"] = get_epilogue_kernel_solution(math.prod(epilogue_block))\n", + " \n", + " return ctx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b97126e3-103b-4b10-a44a-3c8e7875c46f", + "metadata": {}, + "outputs": [], + "source": [ + 
"benchmark_unfused_emulated_dgemm(problems, setup_func_solution, unfused_igemm_ozaki_solution, epilogue)" + ] + }, + { + "cell_type": "markdown", + "id": "0f76d1a6-d28c-4664-a937-010bc663766a", + "metadata": {}, + "source": [ + "### Performance Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fb09f77-4bd5-4b92-bad6-dbc9664a5361", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import math\n", + "\n", + "# INT8 TOPS, MEMORY BANDWIDTH (GB/s)\n", + "GPU_SPECS = {\n", + " \"L40S\": (733, 864),\n", + " \"B200\": (4500, 8000)\n", + "}\n", + "\n", + "# NOTE: This model is very simplistic and does not take quantization or other overheads like slicing and FP64 operations into account\n", + "def roofline_prediction_3_1(m, n, k, slices=7, TILE_M=128, TILE_N=128, TILE_K=128):\n", + " INT8_TOPS, MEMORY_BANDWIDTH_GBS = GPU_SPECS[\"L40S\"]\n", + "\n", + " num_products = (slices * (slices + 1)) // 2\n", + "\n", + " # By design since each thread is computing one output element\n", + " tiles = math.ceil(m / TILE_M) * math.ceil(n / TILE_N)\n", + "\n", + " # Each tile does TILE_M * TILE_N dot products which each have k multiplications and k additions for every product\n", + " flops_per_tile = 2 * TILE_M * TILE_N * k * num_products\n", + "\n", + " fp64_size = np.dtype(np.float64).itemsize\n", + " int32_size = np.dtype(np.int32).itemsize\n", + " int8_size = np.dtype(np.int8).itemsize\n", + "\n", + " # We load a TILE_M rows of matrix A, TILE_N columns of matrix B, and write to TILE_M * TILE_N elements of matrix C\n", + " # This needs to happen for each product\n", + " memory_per_tile = ((TILE_M * k + TILE_N * k) * int8_size + TILE_M * TILE_N * int32_size) * num_products\n", + "\n", + " # In the epilogue kernel, we load the products and read from and write to the output\n", + " memory_per_tile += (TILE_M * TILE_N) * (num_products * int32_size + 2 * fp64_size)\n", + "\n", + " total_memory_gb = tiles * memory_per_tile * 1e-9\n", 
+ " total_tflop = tiles * flops_per_tile * 1e-12\n", + "\n", + " return total_tflop / INT8_TOPS, total_memory_gb / MEMORY_BANDWIDTH_GBS\n", + "\n", + "time_flops, time_membw = roofline_prediction_3_1(2048, 2048, 2048)\n", + "\n", + "print(f\"The runtime from the math operations {time_flops * 1e3} ms and the runtime from memory is {time_membw * 1e3} ms\")\n", + "\n", + "# We will either be bottlenecked by FLOPS or Memory Bandwidth, so we take the maximum\n", + "print(f\"Therefore, the estimated best case runtime is {max(time_flops, time_membw) * 1e3} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "08cc68c7-c9d9-4bab-9665-6ade4fc8402d", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this notebook we've learned the fundamentals of the Ozaki-I Scheme and built an implementation that we will optimize in the next exercises.\n", + "\n", + "We then analyzed why the Ozaki scheme makes sense by building a simplistic model for the product gemms and epilogue kernel. In the next exercise, we will implement kernel fusion to reduce memory overhead and speed up the process." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.02-PartiallyFusedEmulation.ipynb b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.02-PartiallyFusedEmulation.ipynb new file mode 100644 index 00000000..62a13187 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.02-PartiallyFusedEmulation.ipynb @@ -0,0 +1,953 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "24069346-2c9c-4d6b-b812-d59f2804e36c", + "metadata": {}, + "source": [ + "## Exercise 3.2: Optimizing the Ozaki-I Scheme with kernel fusion\n", + "\n", + "Now that we have a working Ozaki-I implementation, we'd like to start fusing the epilogue function into our emulated GEMM kernel. 
This will help reduce the global memory traffic.\n", + "\n", + "In this exercise, we will start this by computing all slice products and accumulate the anti-diagonals in one kernel:\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "2407f8fb-09df-4f4e-8caa-e1c348ecf307", + "metadata": {}, + "source": [ + "### C++ Cmake Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4323a0a4-ca7d-4069-b855-328bfede2d0c", + "metadata": {}, + "outputs": [], + "source": [ + "import sys, os\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "from common_cuda import setup_cmake_project\n", + "setup_cmake_project()" + ] + }, + { + "cell_type": "markdown", + "id": "3c5c4e89-5b9e-4783-a904-f93aa5b07055", + "metadata": {}, + "source": [ + "### Python Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7e19123-19f9-42cb-a05c-a635497e834b", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import math\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc, MAX_ALIGNMENT\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import *\n", + "from emulation_utils import get_width, epilogue_ldexp" + ] + }, + { + "cell_type": "markdown", + "id": "e0b869c3-d014-4001-9673-612dc6c35de7", + "metadata": {}, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddac7a49-07ab-4337-979c-aa9e121902f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/parameters.hpp.inc\n", + 
"\n", + " // ===================================\n", + " // Problem configuration\n", + " // ===================================\n", + "\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };\n", + " \n", + "\n", + " // ===================================\n", + " // Global GEMM configuration\n", + " // ===================================\n", + "\n", + " // The number of slices used in emulation algorithm\n", + " // More slices = higher precision but more computation\n", + " constexpr unsigned slices = 7;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9545479-9d2f-4130-b410-173f9cf87943", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/cublasdx_config.hpp.inc\n", + "\n", + " using slice_value_type = int8_t; // Precision for individual slices\n", + " using accumulator_value_type = int32_t; // Precision for accumulation\n", + "\n", + " // The shape of data tile processed by a single CTA block\n", + " constexpr int tile_m = 128;\n", + " constexpr int tile_n = 128;\n", + " constexpr int tile_k = 128;\n", + "\n", + " // The shape of CTA block (number of threads)\n", + " constexpr int cta_shape_x = 128;\n", + " constexpr int cta_shape_y = 1;\n", + " constexpr int cta_shape_z = 1;\n", + "\n", + " using BLAS = decltype(cublasdx::Size() +\n", + " cublasdx::Precision() +\n", + " cublasdx::Type() + cublasdx::Function() +\n", + " cublasdx::Arrangement() + cublasdx::Block() +\n", + " cublasdx::BlockDim() + cublasdx::StaticBlockDim() +\n", + " cublasdx::WithPipeline() + cublasdx::MaxAlignment() + cublasdx::EnableInputStreaming() +\n", + " cublasdx::SM());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a0f84-9477-4294-b59b-f4d806411cba", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/pipeline_config.hpp.inc\n", + "\n", + " constexpr int pipeline_depth = 
3;\n", + " auto device_pipeline = cublasdx::suggest_device_pipeline(\n", + " tensor_slice_a, tensor_slice_b)\n", + " .value();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1013bf66-8ad0-441d-b86e-694588047e84", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/fused_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " SliceProductTensor slice_product_tensor) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " /* \n", + " * EXERCISE --> Complete the kernel to compute all products and accumulate along diagonals in the same kernel\n", + " */\n", + "\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) {\n", + " // ================================\n", + " // 1. SETUP AND TILE PREPARATION\n", + " // ================================\n", + "\n", + " // EXERCISE --> Choose your starting diagonal and term along the diagonal\n", + " constexpr auto initial_diag = ;\n", + " constexpr auto initial_term = ;\n", + "\n", + " // Get pipeline tile\n", + " auto tile_pipeline = device_pipeline.get_tile(\n", + " smem, cublasdx::make_coord(blockIdx.x, initial_term), cublasdx::make_coord(blockIdx.y, initial_diag));\n", + "\n", + " auto accumulator = tile_pipeline.get_accumulator();\n", + "\n", + " // ============================================\n", + " // 2. OZAKI SCHEME DIAGONAL ITERATION\n", + " // ============================================\n", + "# pragma unroll 1\n", + " for (int diag = initial_diag; /* EXERCISE --> for loop over diagonals */) {\n", + "\n", + " // Initialize accumulator for this diagonal\n", + " accumulator.clear();\n", + "\n", + " // ==========================================\n", + " // 3. 
SLICE COMBINATION COMPUTATION\n", + " // ==========================================\n", + "# pragma unroll 1\n", + " for (int term = initial_term; /* EXERCISE --> for loop to iterate along the diagonal */) {\n", + " // =========================================\n", + " // 4. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " // =========================================\n", + "\n", + " tile_pipeline.execute(accumulator);\n", + "\n", + " // EXERCISE --> Determine which slice of A and slice of B to multiply\n", + " const auto next_slice_row = ;\n", + " const auto next_slice_col = ;\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " cublasdx::make_coord(blockIdx.x, next_slice_row),\n", + " cublasdx::make_coord(blockIdx.y, next_slice_col));\n", + " }\n", + "\n", + " // ========================================\n", + " // 5. RESULT RECONSTRUCTION AND EPILOGUE\n", + " // ========================================\n", + "\n", + " if (accumulator.is_thread_active()) {\n", + " // Choose output tensor for this slice iteration\n", + " auto this_slice_output = slice_product_tensor(cublasdx::slice, cublasdx::slice, diag);\n", + " // Get output tile for this block\n", + " auto slice_output_tile = cublasdx::get_tile(this_slice_output, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + " // Store results\n", + " accumulator.partition_and_store(slice_output_tile);\n", + " }\n", + " }\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e67d564-a5a2-4ba7-bc7c-3f983b12183d", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/epilogue_config.hpp.inc\n", + "\n", + " constexpr int epilogue_kernel_tile_m = 16;\n", + " constexpr int epilogue_kernel_tile_n = 16;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa6f3a54-bab2-45e1-9459-2570b8d65b12", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/epilogue_kernel.hpp.inc\n", + "\n", + 
"template\n", + "__launch_bounds__(BlockSize, 1) __global__ void epilogue_kernel(double alpha,\n", + " double beta,\n", + " ProductTensor product_tensor,\n", + " ShiftTensorA shift_tensor_a,\n", + " ShiftTensorB shift_tensor_b,\n", + " OutTensor out_tensor) {\n", + " using product_datatype = tutorial::tensor_value_type_t;\n", + " using shift_datatype = tutorial::tensor_value_type_t;\n", + " using out_datatype = tutorial::tensor_value_type_t;\n", + "\n", + " const auto tid_m = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const auto tid_n = threadIdx.y + blockIdx.y * blockDim.y;\n", + "\n", + " int shift_a = shift_tensor_a(tid_m);\n", + " int shift_b = shift_tensor_b(tid_n);\n", + "\n", + " /*\n", + " * EXERCISE --> Complete the implementation of the epilogue kernel\n", + " */\n", + " #pragma unroll\n", + " for (/* for loop over diagonals */) {\n", + " product_datatype diag_acc = product_tensor(tid_m, tid_n, diag);\n", + " result += nth_slice_to_fp64(diag, diag_acc, shift_a + shift_b);\n", + " }\n", + "\n", + " out_tensor(tid_m, tid_n) = alpha * result + beta * out_tensor(tid_m, tid_n);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd7f8cd9-cc63-44e0-90fd-abd1b81abdcb", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build ./build -t 2b_partially_fused_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cdaa3e0-a583-4f99-926f-5ddc45c15a46", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/2b_partially_fused_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "24835933-3e6d-4cdd-b2b2-4fa9796d9a1e", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "e2f67f87-240b-441a-b267-8a6c07b9a569", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. If you want to restart your exercise make sure you rewrite kernel back and recompile it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbfa1426-9438-43ec-843d-2f086d31ba3a", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/fused_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " SliceProductTensor slice_product_tensor) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) {\n", + " // ================================\n", + " // 1. SETUP AND TILE PREPARATION\n", + " // ================================\n", + "\n", + " constexpr auto initial_diag = Slices - 1;\n", + " constexpr auto initial_term = 0;\n", + "\n", + " // Get pipeline tile\n", + " auto tile_pipeline = device_pipeline.get_tile(\n", + " smem, cublasdx::make_coord(blockIdx.x, initial_term), cublasdx::make_coord(blockIdx.y, initial_diag));\n", + "\n", + " auto accumulator = tile_pipeline.get_accumulator();\n", + "\n", + " // ============================================\n", + " // 2. OZAKI SCHEME DIAGONAL ITERATION\n", + " // ============================================\n", + "\n", + " // Iterate over diagonals in reverse order (highest power of 2 first)\n", + " // This ensures proper accumulation order for numerical stability\n", + "# pragma unroll 1\n", + " for (auto diag = initial_diag; diag >= 0; --diag) {\n", + "\n", + " // Initialize accumulator for this diagonal\n", + " accumulator.clear();\n", + "\n", + " // ==========================================\n", + " // 3. 
SLICE COMBINATION COMPUTATION\n", + " // ==========================================\n", + "\n", + " // Compute all slice combinations that contribute to this diagonal\n", + " // For diagonal d, we compute: A_slice[i] * B_slice[d-i] for i = 0 to d\n", + "# pragma unroll 1\n", + " for (auto term = initial_term; term <= diag; ++term) {\n", + " // =========================================\n", + " // 4. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " // =========================================\n", + "\n", + " tile_pipeline.execute(accumulator);\n", + "\n", + " const auto next_slice_row = (term == diag) ? 0 : term + 1; // A slice index\n", + " const auto next_slice_col = (term == diag) ? (diag - 1) : (diag - next_slice_row); // B slice index\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " cublasdx::make_coord(blockIdx.x, next_slice_row),\n", + " cublasdx::make_coord(blockIdx.y, next_slice_col));\n", + " } /* end of slice combination loop */\n", + "\n", + " // ========================================\n", + " // 5. 
RESULT RECONSTRUCTION AND EPILOGUE\n", + " // ========================================\n", + "\n", + " if (accumulator.is_thread_active()) {\n", + " // Choose output tensor for this slice iteration\n", + " auto this_slice_output = slice_product_tensor(cublasdx::slice, cublasdx::slice, diag);\n", + " // Get output tile for this block\n", + " auto slice_output_tile = cublasdx::get_tile(this_slice_output, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + " // Store results\n", + " accumulator.partition_and_store(slice_output_tile);\n", + " }\n", + " }\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2facdcf3-6781-4990-ab5b-74577daf9148", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2b_partially_fused_emulation/epilogue_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(BlockSize, 1) __global__ void epilogue_kernel(double alpha,\n", + " double beta,\n", + " ProductTensor product_tensor,\n", + " ShiftTensorA shift_tensor_a,\n", + " ShiftTensorB shift_tensor_b,\n", + " OutTensor out_tensor) {\n", + " using product_datatype = tutorial::tensor_value_type_t;\n", + " using shift_datatype = tutorial::tensor_value_type_t;\n", + " using out_datatype = tutorial::tensor_value_type_t;\n", + "\n", + " const auto tid_m = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const auto tid_n = threadIdx.y + blockIdx.y * blockDim.y;\n", + "\n", + " int shift_a = shift_tensor_a(tid_m);\n", + " int shift_b = shift_tensor_b(tid_n);\n", + "\n", + " auto product_view = product_tensor(tid_m, tid_n, cublasdx::slice);\n", + "\n", + " double result = 0.0;\n", + "\n", + "#pragma unroll\n", + " for (auto diag = Slices-1; diag >= 0; diag--) {\n", + " product_datatype diag_acc = product_tensor(tid_m, tid_n, diag);\n", + " result += nth_slice_to_fp64(diag, diag_acc, shift_a + shift_b);\n", + " }\n", + "\n", + " out_tensor(tid_m, tid_n) = alpha * result + beta * out_tensor(tid_m, tid_n);\n", + "}" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "fa5dc237-0b70-4489-93da-50d19b77c872", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build ./build -t 2b_partially_fused_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "844f2234-8522-44da-935f-57825c4a85e8", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/2b_partially_fused_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "2797408b-db37-4676-9b5f-ae982a5f0165", + "metadata": {}, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f340d7f-c138-4a09-a0f5-27b197a13641", + "metadata": {}, + "outputs": [], + "source": [ + "problems = [\n", + " (2048, 2048, 2048, 0.9, 1.1),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38624e49-3518-43e7-900b-9f84ee91a7d3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_emulated_gemm_kernel(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLAS.block_size, 1))\n", + " def gemm_kernel(tensor_c, device_pipeline: DevicePipeline):\n", + " _, _, slices = tensor_c.shape\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=device_pipeline.buffer_alignment)\n", + "\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + " \n", + " # EXERCISE --> Complete the kernel to compute all products and accumulate along diagonals in the same kernel\n", + "\n", + " # ================================\n", + " # 1. 
SETUP AND TILE PREPARATION\n", + " # ================================\n", + "\n", + " # EXERCISE --> Choose your starting diagonal and term along the diagonal\n", + " initial_diag = -1\n", + " initial_term = -1\n", + "\n", + " # Get pipeline tile\n", + " tile_pipeline = device_pipeline.get_tile(smem,\n", + " (block_m, np.int32(initial_term)),\n", + " (block_n, np.int32(initial_diag)))\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + "\n", + " c_views = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " :\n", + " ]\n", + " ldc = max(c_views.strides[:2]) // c_views.itemsize\n", + " \n", + " # ============================================\n", + " # 2. OZAKI SCHEME DIAGONAL ITERATION\n", + " # ============================================\n", + " for diag in range(-1): # EXERCISE --> for loop over diagonals\n", + "\n", + " # Initialize accumulator for this diagonal\n", + " accumulator.clear()\n", + "\n", + " # ==========================================\n", + " # 3. SLICE COMBINATION COMPUTATION\n", + " # ==========================================\n", + " for term in range(-1): # EXERCISE --> for loop to iterate along the diagonal\n", + " # =========================================\n", + " # 4. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " # =========================================\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " # EXERCISE --> Determine which slice of A and slice of B to multiply\n", + " next_slice_row = -1\n", + " next_slice_col = -1\n", + "\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " (block_m, np.int32(next_slice_row)),\n", + " (block_n, np.int32(next_slice_col)))\n", + "\n", + " # ========================================\n", + " # 5. 
RESULT RECONSTRUCTION AND EPILOGUE\n", + " # ========================================\n", + " if accumulator.is_thread_active():\n", + " gmem_c = make_tensor(c_views[:,:,diag], BLAS.get_layout_gmem_c(ldc))\n", + " accumulator.partition_and_copy(accumulator.get_results(), gmem_c)\n", + "\n", + " # tile_pipeline._del()\n", + "\n", + " return gemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdc17a5c-d199-4979-8e16-309eebc31824", + "metadata": {}, + "outputs": [], + "source": [ + "def partial_fused_dgemm_ozaki(tensor_slicedA_cupy, tensor_slicedB_cupy, tensor_diag_cupy, context, warmup=True):\n", + " BLAS = context[\"BLAS\"]\n", + " pipeline_depth = context[\"PIPELINE_DEPTH\"]\n", + " gemm_kernel = context[\"gemm_kernel\"]\n", + " grid = context[\"gemm_grid\"]\n", + " block = context[\"gemm_block\"]\n", + "\n", + " tensor_slicedA = cuda.as_cuda_array(tensor_slicedA_cupy)\n", + " tensor_slicedB = cuda.as_cuda_array(tensor_slicedB_cupy)\n", + " tensor_diag = cuda.as_cuda_array(tensor_diag_cupy)\n", + "\n", + " device_pipeline = BLAS.suggest_device_pipeline(pipeline_depth, tensor_slicedA, tensor_slicedB)\n", + "\n", + " if warmup:\n", + " set_max_dynamic_shared_size_bytes(gemm_kernel, device_pipeline.buffer_size,\n", + " tensor_diag, device_pipeline)\n", + " gemm_kernel[grid, block, 0, device_pipeline.buffer_size](tensor_diag, device_pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2830fd6c-c727-47e2-870e-e94932633e3e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_epilogue_kernel(block_size=64):\n", + " uint8_width = get_width(np.uint8)\n", + "\n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + "\n", + " @cuda.jit(launch_bounds=(block_size, 1))\n", + " def 
epilogue_kernel(slices, tensor_diag, tensor_shift_a, tensor_shift_b, tensor_out, alpha, beta):\n", + " tid_m = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " tid_n = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " if tid_m >= tensor_out.shape[0] or tid_n >= tensor_out.shape[1]:\n", + " return\n", + "\n", + " shift_a = tensor_shift_a[tid_m]\n", + " shift_b = tensor_shift_b[tid_n]\n", + " \n", + " # EXERCISE --> Complete the implementation of the epilogue kernel\n", + " diag_view = tensor_diag[tid_m, tid_n, :]\n", + "\n", + " result = 0.0\n", + " for diag in range(-1): # EXERCISE --> loop over diagonals\n", + " result += nth_slice_to_fp64(diag, diag_view[diag], shift_a + shift_b)\n", + "\n", + " tensor_out[tid_m, tid_n] = alpha * result + beta * tensor_out[tid_m, tid_n]\n", + "\n", + " return epilogue_kernel\n", + "\n", + "def epilogue(slices, tensor_products, tensor_shift_a, tensor_shift_b, tensor_c, alpha, beta, context):\n", + " epilogue_kernel = context[\"epilogue_kernel\"]\n", + " \n", + " grid = context[\"epilogue_grid\"]\n", + " block = context[\"epilogue_block\"]\n", + "\n", + " epilogue_kernel[grid, block](slices, tensor_products, tensor_shift_a, tensor_shift_b, tensor_c, alpha, beta)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5242e285-16c3-48f8-b59d-486ba36e2674", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func(m, n, k):\n", + " tile_m = 128\n", + " tile_n = 128\n", + " tile_k = 128\n", + " block_size = 128\n", + " \n", + " pipeline_depth = 3\n", + "\n", + " epilogue_tile_m = 16\n", + " epilogue_tile_n = 16\n", + "\n", + " assert m % tile_m == 0, \"Unsupported dimension m for TILE_M\"\n", + " assert n % tile_n == 0, \"Unsupported dimension n for TILE_N\"\n", + " assert k % tile_k == 0, \"Unsupported dimension k for TILE_K\"\n", + " assert k >= (tile_k * pipeline_depth), \"Unsupported pipeline depth for k\"\n", + "\n", + " assert m % epilogue_tile_m == 0, \"Unsupported 
dimension for EPILOGUE_TILE_M\"\n", + " assert n % epilogue_tile_n == 0, \"Unsupported dimension for EPILOGUE_TILE_N\"\n", + " \n", + " BLAS = Matmul(size=(tile_m, tile_n, tile_k),\n", + " precision=(np.int8, np.int8, np.int32),\n", + " data_type=\"real\",\n", + " alignment=MAX_ALIGNMENT,\n", + " arrangement=(\"row_major\", \"col_major\", \"col_major\"), # Do not change\n", + " execution=\"Block\",\n", + " block_size=block_size,\n", + " with_pipeline=True,\n", + " enable_input_streaming=True,\n", + " static_block_dim=True)\n", + "\n", + " gemm_grid = (m // tile_m, n // tile_n)\n", + " gemm_block = BLAS.block_dim\n", + "\n", + " epilogue_grid = (m // epilogue_tile_m, n // epilogue_tile_n)\n", + " epilogue_block = (epilogue_tile_m, epilogue_tile_n)\n", + "\n", + " return {\n", + " \"BLAS\": BLAS,\n", + " \"PIPELINE_DEPTH\": pipeline_depth,\n", + " \"gemm_kernel\" : get_emulated_gemm_kernel(BLAS),\n", + " \"gemm_grid\": gemm_grid,\n", + " \"gemm_block\": gemm_block,\n", + " \"epilogue_kernel\": get_epilogue_kernel(math.prod(epilogue_block)),\n", + " \"epilogue_grid\": epilogue_grid,\n", + " \"epilogue_block\": epilogue_block\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2e360d9-4015-4381-9e9f-c4f4ffddc3f2", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_partially_fused_emulated_dgemm(problems, setup_func, partial_fused_dgemm_ozaki, epilogue)" + ] + }, + { + "cell_type": "markdown", + "id": "64c366a1-54a9-4bf1-8453-952b0c59a7fa", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ee7849d-b608-40ac-a2d3-0662acf82af9", + "metadata": {}, + "outputs": [], + "source": [ + "def get_emulated_gemm_kernel_solution(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, 
launch_bounds=(BLAS.block_size, 1))\n", + " def gemm_kernel(tensor_c, device_pipeline: DevicePipeline):\n", + " _, _, slices = tensor_c.shape\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=device_pipeline.buffer_alignment)\n", + "\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + "\n", + " initial_diag = slices - 1\n", + " initial_term = 0\n", + "\n", + " tile_pipeline = device_pipeline.get_tile(smem,\n", + " (block_m, np.int32(initial_term)),\n", + " (block_n, np.int32(initial_diag)))\n", + "\n", + " c_views = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " :\n", + " ]\n", + " ldc = max(c_views.strides[:2]) // c_views.itemsize\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + " for diag in range(initial_diag, -1, -1):\n", + " accumulator.clear()\n", + "\n", + " for term in range(initial_term, diag + 1):\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " next_slice_row = 0 if term == diag else term + 1\n", + " next_slice_col = (diag - 1) if term == diag else diag - next_slice_row\n", + "\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " (block_m, np.int32(next_slice_row)),\n", + " (block_n, np.int32(next_slice_col)))\n", + "\n", + " if accumulator.is_thread_active():\n", + " gmem_c = make_tensor(c_views[:,:,diag], BLAS.get_layout_gmem_c(ldc))\n", + " accumulator.partition_and_copy(accumulator.get_results(), gmem_c)\n", + "\n", + " tile_pipeline._del()\n", + "\n", + " return gemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a9b39a1-6ad0-46bf-b8a0-190893ebd8c5", + "metadata": {}, + "outputs": [], + "source": [ + "def get_epilogue_kernel_solution(block_size=64):\n", + " uint8_width = get_width(np.uint8)\n", + 
"\n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + "\n", + " @cuda.jit(launch_bounds=(block_size, 1))\n", + " def epilogue_kernel(slices, tensor_diag, tensor_shift_a, tensor_shift_b, tensor_out, alpha, beta):\n", + " tid_m = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " tid_n = cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y\n", + "\n", + " shift_a = tensor_shift_a[tid_m]\n", + " shift_b = tensor_shift_b[tid_n]\n", + "\n", + " diag_view = tensor_diag[tid_m, tid_n, :]\n", + "\n", + " result = 0.0\n", + " for diag in range(slices-1, -1, -1):\n", + " result += nth_slice_to_fp64(diag, diag_view[diag], shift_a + shift_b)\n", + "\n", + " if beta != 0:\n", + " result = alpha * result + beta * tensor_out[tid_m, tid_n]\n", + " else:\n", + " result = alpha * result\n", + "\n", + " tensor_out[tid_m, tid_n] = result\n", + "\n", + " return epilogue_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bef11022-2c16-4c87-9f97-bb9701f44b66", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func_solution(m, n, k):\n", + " ctx = setup_func(m, n, k)\n", + " BLAS = ctx[\"BLAS\"]\n", + " epilogue_block = ctx[\"epilogue_block\"]\n", + " ctx[\"gemm_kernel\"] = get_emulated_gemm_kernel_solution(BLAS);\n", + " ctx[\"epilogue_kernel\"] = get_epilogue_kernel_solution(math.prod(epilogue_block))\n", + " \n", + " return ctx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e64dc1fa-cbb4-4182-9010-da3956cc7e3a", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_partially_fused_emulated_dgemm(problems, setup_func_solution, partial_fused_dgemm_ozaki, epilogue)" + ] + }, + { + "cell_type": "markdown", + "id": "aca5fd0d-95b7-4133-a814-14e04c133a30", + "metadata": {}, + "source": [ + "### 
Performance Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d5ebee6-e9e2-4a86-9308-5456a39acbf6", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import math\n", + "\n", + "# INT8 TOPS, MEMORY BANDWIDTH (GB/s)\n", + "GPU_SPECS = {\n", + " \"L40S\": (733, 864),\n", + " \"B200\": (4500, 8000)\n", + "}\n", + "\n", + "# NOTE: This model is very simplistic and does not take quantization or other overheads like slicing and FP64 operations into account\n", + "def roofline_prediction_3_2(m, n, k, slices=7, TILE_M=128, TILE_N=128, TILE_K=128):\n", + " INT8_TOPS, MEMORY_BANDWIDTH_GBS = GPU_SPECS[\"L40S\"]\n", + "\n", + " num_products = (slices * (slices + 1)) // 2\n", + "\n", + " # By design since each thread is computing one output element\n", + " tiles = math.ceil(m / TILE_M) * math.ceil(n / TILE_N)\n", + "\n", + " # Each tile does TILE_M * TILE_N dot products which each have k multiplications and k additions for every product\n", + " flops_per_tile = 2 * TILE_M * TILE_N * k * num_products\n", + "\n", + " fp64_size = np.dtype(np.float64).itemsize\n", + " int32_size = np.dtype(np.int32).itemsize\n", + " int8_size = np.dtype(np.int8).itemsize\n", + "\n", + " # We load TILE_M rows of matrix A, TILE_N columns of matrix B for each product.\n", + " # Then, we read from and write to TILE_M * TILE_N elements of matrix C\n", + " # This needs to happen once for each diagonal\n", + " memory_per_tile = ((TILE_M * k + TILE_N * k) * int8_size + 2 * TILE_M * TILE_N * int32_size) * num_products\n", + "\n", + " # In the epilogue kernel, we load the products and write the output\n", + " memory_per_tile += (TILE_M * TILE_N) * (num_products * int32_size + fp64_size)\n", + "\n", + " total_memory_gb = tiles * memory_per_tile * 1e-9\n", + " total_tflop = tiles * flops_per_tile * 1e-12\n", + "\n", + " return total_tflop / INT8_TOPS, total_memory_gb / MEMORY_BANDWIDTH_GBS\n", + "\n", + "time_flops, time_membw = 
roofline_prediction_3_2(2048, 2048, 2048)\n", + "\n", + "print(f\"The runtime from the math operations {time_flops * 1e3} ms and the runtime from memory is {time_membw * 1e3} ms\")\n", + "\n", + "# We will either be bottlenecked by FLOPS or Memory Bandwidth, so we take the maximum\n", + "print(f\"Therefore, the estimated best case runtime is {max(time_flops, time_membw) * 1e3} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "1aba46df-a842-4400-a646-3179b64e9b32", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this exercise, we've learned how we can use the pipeline APIs to implement more complex routines. Specifically, we've learned how to:\n", + "\n", + "1. Clear the accumulator when we are ready for a new computation\n", + "2. Reset the device pipeline accumulator for new calculations\n", + "3. How to iterate over 3D tensors with the pipeline API" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.03-FusedEmulation.ipynb b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.03-FusedEmulation.ipynb new file mode 100644 index 00000000..6daf8fe7 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/03.03-FusedEmulation.ipynb @@ -0,0 +1,960 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "24069346-2c9c-4d6b-b812-d59f2804e36c", + "metadata": {}, + "source": [ + "## Exercise 3.3: Fully fused Ozaki-I Scheme\n", + "\n", + "Our next optimization will be to fuse the remaining portions of the epilogue function. 
This would be casting anti-diagonal accumulators to FP64, scaling the FP64 values, accumulating anti-diagonals, and scaling based on the exponent shifting done for slicing.\n", + "\n", + "Once you are done, spend some time profiling the kernels across a few different problem shapes. Think about how the results change and why. What factors are causing this? Hint: look at the grid dimensions and consider hardware resources.\n", + "\n", + "Some other questions you can consider:\n", + "1. Where does fusion seem to help the most?\n", + "2. Can you find cases where fusion does not help?\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "6a0a204e-ea61-49a6-a97d-e783f3912013", + "metadata": {}, + "source": [ + "### C++ Cmake Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c44386b3-b024-429a-b78a-1ca743f73e86", + "metadata": {}, + "outputs": [], + "source": [ + "import sys, os\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "from common_cuda import setup_cmake_project\n", + "setup_cmake_project()" + ] + }, + { + "cell_type": "markdown", + "id": "657980a5-14a1-4d9f-b013-cade838938a1", + "metadata": {}, + "source": [ + "### Python Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24745538-973e-4721-bb29-7aa4e19dd505", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "import math\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc, MAX_ALIGNMENT\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor, make_fragment_like\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import 
*\n", + "from emulation_utils import get_width, epilogue_ldexp" + ] + }, + { + "cell_type": "markdown", + "id": "6c55d2c0-5479-4d9e-abbd-3ac0bcb7f0b4", + "metadata": {}, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddac7a49-07ab-4337-979c-aa9e121902f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2c_fully_fused_emulation/parameters.hpp.inc\n", + "\n", + " // ===================================\n", + " // Problem configuration\n", + " // ===================================\n", + "\n", + " // (gemm_m, gemm_n, gemm_k, alpha, beta)\n", + " std::vector problems = {\n", + " {2048, 2048, 2048, 0.9, 1.1}\n", + " };\n", + " \n", + "\n", + " // ===================================\n", + " // Global GEMM configuration\n", + " // ===================================\n", + "\n", + " // The number of slices used in emulation algorithm\n", + " // More slices = higher precision but more computation\n", + " constexpr unsigned slices = 7;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9545479-9d2f-4130-b410-173f9cf87943", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2c_fully_fused_emulation/cublasdx_config.hpp.inc\n", + "\n", + " using slice_value_type = int8_t; // Precision for individual slices\n", + " using accumulator_value_type = int32_t; // Precision for accumulation\n", + "\n", + " // The shape of data tile processed by a single CTA block\n", + " constexpr int tile_m = 128;\n", + " constexpr int tile_n = 128;\n", + " constexpr int tile_k = 128;\n", + "\n", + " // The shape of CTA block (number of threads)\n", + " constexpr int cta_shape_x = 128;\n", + " constexpr int cta_shape_y = 1;\n", + " constexpr int cta_shape_z = 1;\n", + "\n", + " using BLAS = decltype(cublasdx::Size() +\n", + " cublasdx::Precision() +\n", + " cublasdx::Type() + cublasdx::Function() +\n", + " cublasdx::Arrangement() + cublasdx::Block() +\n", + " cublasdx::BlockDim() + 
cublasdx::StaticBlockDim() +\n", + " cublasdx::WithPipeline() + cublasdx::MaxAlignment() + cublasdx::EnableInputStreaming() +\n", + " cublasdx::SM());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a0f84-9477-4294-b59b-f4d806411cba", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2c_fully_fused_emulation/pipeline_config.hpp.inc\n", + "\n", + " constexpr int pipeline_depth = 3;\n", + " auto device_pipeline = cublasdx::suggest_device_pipeline(\n", + " tensor_slice_a, tensor_slice_b)\n", + " .value();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1013bf66-8ad0-441d-b86e-694588047e84", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2c_fully_fused_emulation/fused_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " Alpha alpha,\n", + " Beta beta,\n", + " CTensor gmem_c_fp64,\n", + " AShiftTensor const gmem_shift_a,\n", + " BShiftTensor const gmem_shift_b) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " /* \n", + " * EXERCISE --> Complete the kernel to compute all products, accumulate along diagonals, and convert back to FP64\n", + " */\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) {\n", + " // ================================\n", + " // 1. 
SETUP AND TILE PREPARATION\n", + " // ================================\n", + "\n", + " constexpr int tile_m = cublasdx::size_of_v_m;\n", + " constexpr int tile_n = cublasdx::size_of_v_n;\n", + "\n", + " // EXERCISE --> Choose the diagonal and term along the diagonal that you'd like to start with\n", + " constexpr auto initial_diag = \n", + " constexpr auto initial_term = \n", + "\n", + " auto [pipeline_smem, smem_shift_a, smem_shift_b] =\n", + " cublasdx::shared_memory::slice(smem,\n", + " device_pipeline.buffer_alignment(),\n", + " device_pipeline.buffer_size(),\n", + " cublasdx::alignment_of_v_a,\n", + " cute::make_layout(cute::Int()),\n", + " cublasdx::alignment_of_v_b,\n", + " cute::make_layout(cute::Int()));\n", + "\n", + " // Copy general purpose data\n", + " cublasdx::copy(gmem_shift_a(cute::_, blockIdx.x), smem_shift_a);\n", + " cublasdx::copy(gmem_shift_b(cute::_, blockIdx.y), smem_shift_b);\n", + " cublasdx::copy_wait();\n", + " \n", + " // Get pipeline tile\n", + " auto tile_pipeline = device_pipeline.get_tile(pipeline_smem,\n", + " cublasdx::make_coord(blockIdx.x, initial_term),\n", + " cublasdx::make_coord(blockIdx.y, initial_diag));\n", + "\n", + " auto accumulator = tile_pipeline.get_accumulator();\n", + "\n", + " // ================================\n", + " // 2. FP64 C INPUT / OUTPUT TILE SETUP\n", + " // ================================\n", + "\n", + " auto tile_c_fp64_gmem = cublasdx::get_tile(gmem_c_fp64, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + "\n", + " // ============================================\n", + " // 3. OZAKI SCHEME DIAGONAL ITERATION\n", + " // ============================================\n", + "# pragma unroll 1\n", + " for (int diag = initial_diag; /* for loop over diagonals */) {\n", + "\n", + " // Initialize accumulator for this diagonal\n", + " accumulator.clear();\n", + "\n", + " // ==========================================\n", + " // 4. 
SLICE COMBINATION COMPUTATION\n", + " // ==========================================\n", + "# pragma unroll 1\n", + " for (int term = initial_term; /* for loop to iterate along the diagonal */) {\n", + " // =========================================\n", + " // 5. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " // =========================================\n", + "\n", + " tile_pipeline.execute(accumulator);\n", + "\n", + " const auto next_slice_row = // A slice index\n", + " const auto next_slice_col = // B slice index\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " cublasdx::make_coord(blockIdx.x, next_slice_row),\n", + " cublasdx::make_coord(blockIdx.y, next_slice_col));\n", + " }\n", + "\n", + " // ========================================\n", + " // 6. RESULT RECONSTRUCTION AND EPILOGUE\n", + " // ========================================\n", + "\n", + " auto c_fp64_frag = accumulator.make_partition_and_copy(tile_c_fp64_gmem);\n", + "\n", + " if (accumulator.is_thread_active()) {\n", + " // Convert accumulated int32_t results back to double precision\n", + " // and apply appropriate scaling based on slice positions\n", + " auto gemm_results = accumulator.get_results();\n", + "\n", + " // Load existing C values\n", + " auto d_fp64_frag = cublasdx::make_fragment_like(gemm_results);\n", + "\n", + " // At this point of the computation, we can no longer longer do tile based operations. When we convert back to\n", + " // FP64 we need to know the shifts associated with the row of A and column of B that produced this value. The\n", + " // cublasDx library gives us the ability to figure out the relative index within the tile. 
We can use this to\n", + " // find our shifts, do some intermediate computations, and then proceed with more tile computations.\n", + " \n", + " # pragma unroll\n", + " for (int i = 0; i < cublasdx::size(d_fp64_frag); ++i) {\n", + " const auto [global_x, global_y] = accumulator.map_fragment_index(i);\n", + "\n", + " // Exercise --> Use shared memory to get the shifts for this particular element\n", + " const auto shift_a_elem = \n", + " const auto shift_b_elem = \n", + "\n", + " // Convert int32_t slice result back to double precision\n", + " // with appropriate scaling for this diagonal and element\n", + " d_fp64_frag(i) = nth_slice_to_fp64(diag, gemm_results(i), shift_a_elem + shift_b_elem);\n", + " }\n", + "\n", + " // Apply alpha/beta scaling and accumulate into C\n", + " // Use beta only for the first diagonal we process, then just add (beta=1.0)\n", + " double beta_used = beta;\n", + " if (/* EXERCISE --> Figure out when to use 1.0 for beta */) {\n", + " beta_used = 1.0;\n", + " }\n", + " cublasdx::axpby(alpha, d_fp64_frag, beta_used, c_fp64_frag); \n", + " }\n", + " \n", + " // Store results back to global memory\n", + " accumulator.partition_and_copy(c_fp64_frag, tile_c_fp64_gmem);\n", + " }\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c94a5e0-4f37-4a41-9524-31bf6f593cc4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!cmake --build ./build -t 2c_fully_fused_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29399cdc-c197-4e78-9e05-f2b9833dffc9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!./build/2c_fully_fused_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "357d5be5-c802-451d-b94b-c57a9571e009", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": 
"5ea1c518-5494-42bd-8eca-a44314c9c2dc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-01-24T23:49:12.098964Z", + "iopub.status.busy": "2026-01-24T23:49:12.098749Z", + "iopub.status.idle": "2026-01-24T23:49:12.237397Z", + "shell.execute_reply": "2026-01-24T23:49:12.236569Z", + "shell.execute_reply.started": "2026-01-24T23:49:12.098941Z" + } + }, + "source": [ + "We will rewrite kernel now and recompile the solution. If you want to restart your exercise make sure you rewrite kernel back and recompile it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbd00ed9-d723-4266-b1ee-b3eca06442ee", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/2c_fully_fused_emulation/fused_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " Alpha alpha,\n", + " Beta beta,\n", + " CTensor gmem_c_fp64,\n", + " AShiftTensor const gmem_shift_a,\n", + " BShiftTensor const gmem_shift_b) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) {\n", + " // ================================\n", + " // 1. 
SETUP AND TILE PREPARATION\n", + " // ================================\n", + "\n", + " constexpr int tile_m = cublasdx::size_of_v_m;\n", + " constexpr int tile_n = cublasdx::size_of_v_n;\n", + "\n", + " constexpr auto initial_diag = Slices - 1;\n", + " constexpr auto initial_term = 0;\n", + "\n", + " auto [pipeline_smem, smem_shift_a, smem_shift_b] =\n", + " cublasdx::shared_memory::slice(smem,\n", + " device_pipeline.buffer_alignment(),\n", + " device_pipeline.buffer_size(),\n", + " cublasdx::alignment_of_v_a,\n", + " cute::make_layout(cute::Int()),\n", + " cublasdx::alignment_of_v_b,\n", + " cute::make_layout(cute::Int()));\n", + "\n", + " // Copy general purpose data\n", + " cublasdx::copy(gmem_shift_a(cute::_, blockIdx.x), smem_shift_a);\n", + " cublasdx::copy(gmem_shift_b(cute::_, blockIdx.y), smem_shift_b);\n", + " cublasdx::copy_wait();\n", + "\n", + "\n", + " // Get pipeline tile\n", + " auto tile_pipeline = device_pipeline.get_tile(pipeline_smem,\n", + " cublasdx::make_coord(blockIdx.x, initial_term),\n", + " cublasdx::make_coord(blockIdx.y, initial_diag));\n", + "\n", + " auto accumulator = tile_pipeline.get_accumulator();\n", + "\n", + " // ================================\n", + " // 2. FP64 C INPUT / OUTPUT TILE SETUP\n", + " // ================================\n", + "\n", + " auto tile_c_fp64_gmem = cublasdx::get_tile(gmem_c_fp64, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + "\n", + " // ============================================\n", + " // 3. OZAKI SCHEME DIAGONAL ITERATION\n", + " // ============================================\n", + "\n", + " // Iterate over diagonals in reverse order (highest power of 2 first)\n", + " // This ensures proper accumulation order for numerical stability\n", + "# pragma unroll 1\n", + " for (auto diag = initial_diag; diag >= 0; --diag) {\n", + "\n", + " // Initialize accumulator for this diagonal\n", + " accumulator.clear();\n", + "\n", + " // ==========================================\n", + " // 4. 
SLICE COMBINATION COMPUTATION\n", + " // ==========================================\n", + "\n", + " // Compute all slice combinations that contribute to this diagonal\n", + " // For diagonal d, we compute: A_slice[i] * B_slice[d-i] for i = 0 to d\n", + "# pragma unroll 1\n", + " for (auto term = initial_term; term <= diag; ++term) {\n", + " // =========================================\n", + " // 5. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " // =========================================\n", + "\n", + " tile_pipeline.execute(accumulator);\n", + "\n", + " const auto next_slice_row = (term == diag) ? 0 : term + 1; // A slice index\n", + " const auto next_slice_col = (term == diag) ? (diag - 1) : (diag - next_slice_row); // B slice index\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " cublasdx::make_coord(blockIdx.x, next_slice_row),\n", + " cublasdx::make_coord(blockIdx.y, next_slice_col));\n", + " } /* end of slice combination loop */\n", + "\n", + " // ========================================\n", + " // 6. RESULT RECONSTRUCTION AND EPILOGUE\n", + " // ========================================\n", + "\n", + " // Load existing C values\n", + " auto c_fp64_frag = accumulator.make_partition_and_copy(tile_c_fp64_gmem);\n", + "\n", + " if (accumulator.is_thread_active()) {\n", + " // Convert accumulated int32_t results back to double precision\n", + " // and apply appropriate scaling based on slice positions\n", + " auto gemm_results = accumulator.get_results();\n", + "\n", + " auto d_fp64_frag = cublasdx::make_fragment_like(gemm_results);\n", + "\n", + " // At this point of the computation, we can no longer do tile based operations. When we convert back to\n", + " // FP64 we need to know the shifts associated with the row of A and column of B that produced this value. The\n", + " // cublasDx library gives us the ability to figure out the relative index within the tile. 
We can use this to\n", + " // find our shifts, do some intermediate computations, and then proceed with more tile computations.\n", + "\n", + " # pragma unroll\n", + " for (int i = 0; i < cublasdx::size(d_fp64_frag); ++i) {\n", + " const auto [global_x, global_y] = accumulator.map_fragment_index(i);\n", + " const auto shift_a_elem = smem_shift_a(global_x);\n", + " const auto shift_b_elem = smem_shift_b(global_y);\n", + "\n", + " // Convert int32_t slice result back to double precision\n", + " // with appropriate scaling for this diagonal and element\n", + " d_fp64_frag(i) =\n", + " nth_slice_to_fp64(diag, gemm_results(i), shift_a_elem + shift_b_elem);\n", + " }\n", + "\n", + " // Apply alpha/beta scaling and accumulate into C\n", + " // Use beta only for the first diagonal (highest order), then just add (beta=1.0)\n", + " cublasdx::axpby(alpha, d_fp64_frag, (diag == Slices - 1) ? beta : 1.0, c_fp64_frag);\n", + " }\n", + "\n", + " // Store results back to global memory\n", + " accumulator.partition_and_copy(c_fp64_frag, tile_c_fp64_gmem); \n", + " }\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "740fadd9-f450-4f10-abe8-8f40e683a44f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!cmake --build ./build -t 2c_fully_fused_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5f8cefd-d8f2-4bae-9923-c2e309468cb2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!./build/2c_fully_fused_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "c2683baa-4549-43a7-94e7-2915ea6a8838", + "metadata": {}, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "891635c0-7eef-49c7-b713-47ec1aea9795", + "metadata": {}, + "outputs": [], + "source": [ + "problems = [\n", + " (2048, 2048, 
2048, 0.9, 1.1),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9628581e-2410-4ce1-ad6e-d6c3ec2b6b32", + "metadata": {}, + "outputs": [], + "source": [ + "def get_emulated_dgemm_kernel(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + "\n", + " uint8_width = get_width(np.uint8)\n", + " \n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLAS.block_size, 1))\n", + " def dgemm_kernel(slices, shift_a_tensor, shift_b_tensor, alpha, beta, tensor_c, device_pipeline: DevicePipeline):\n", + " m, n = tensor_c.shape\n", + "\n", + " # EXERCISE --> Complete the kernel to compute all products, accumulate along diagonals, and convert back to FP64\n", + "\n", + " # ================================\n", + " # 1. 
SETUP AND TILE PREPARATION\n", + " # ================================\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + " \n", + " # EXERCISE --> Choose the diagonal and term along the diagonal that you'd like to start with\n", + " initial_diag = -1\n", + " initial_term = -1\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=np.int8, alignment=device_pipeline.buffer_alignment)\n", + " smem_pipeline, smem = smem[:device_pipeline.buffer_size], smem[device_pipeline.buffer_size:].view(np.int32)\n", + " smem_shift_a, smem = smem[:tile_m], smem[tile_m:]\n", + " smem_shift_b, smem = smem[:tile_n], smem[tile_n:]\n", + "\n", + " # Copy general purpose data\n", + " block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + "\n", + " if block_start_m >= m or block_start_n >= n:\n", + " return\n", + "\n", + " shift_a_view = shift_a_tensor[block_start_m : block_end_m]\n", + " shift_b_view = shift_b_tensor[block_start_n : block_end_n]\n", + "\n", + " tid = cuda.threadIdx.x\n", + " if tid < tile_m:\n", + " smem_shift_a[tid] = shift_a_view[tid]\n", + " if tid < tile_n:\n", + " smem_shift_b[tid] = shift_b_view[tid]\n", + " cuda.syncthreads()\n", + "\n", + " # Get pipeline tile\n", + " tile_pipeline = device_pipeline.get_tile(smem_pipeline,\n", + " (block_m, np.int32(initial_term)),\n", + " (block_n, np.int32(initial_diag)))\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + "\n", + " # ================================\n", + " # 2. 
FP64 C INPUT / OUTPUT TILE SETUP\n", + " # ================================\n", + "\n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " ldc = max(c_view.strides) // c_view.itemsize\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + " \n", + " # ============================================\n", + " # 3. OZAKI SCHEME DIAGONAL ITERATION\n", + " # ============================================\n", + " for diag in range(-1): # EXERCISE --> for loop over diagonals\n", + " \n", + " accumulator.clear()\n", + "\n", + " # ==========================================\n", + " # 4. SLICE COMBINATION COMPUTATION\n", + " # ==========================================\n", + " for term in range(-1): # EXERCISE --> for loop to iterate along the diagonal\n", + " # =========================================\n", + " # 5. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " # =========================================\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " # EXERCISE --> Determine which slice of A and slice of B to multiply\n", + " next_slice_row = -1\n", + " next_slice_col = -1\n", + "\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " (block_m, np.int32(next_slice_row)),\n", + " (block_n, np.int32(next_slice_col)))\n", + "\n", + " # ========================================\n", + " # 6. RESULT RECONSTRUCTION AND EPILOGUE\n", + " # ========================================\n", + " if accumulator.is_thread_active():\n", + " # Convert accumulated int32_t results back to double precision\n", + " # and apply appropriate scaling based on slice positions\n", + " gemm_results = accumulator.get_results()\n", + "\n", + " # Load existing C values\n", + " c_fp64_frag = accumulator.make_partition_and_copy(gmem_c)\n", + " d_fp64_frag = make_fragment_like(gemm_results, np.float64)\n", + "\n", + " # At this point of the compuation, we can no longer longer do tile based operations. 
When we convert back to\n", + " # FP64 we need to know the shifts associated with the row of A and column of B that produced this value. The\n", + " # nvmath-python cublasDx bindings give us the ability to figure out the relative index within the tile. We can\n", + " # use this to find our shifts, do some intermediate computations, and then proceed with more tile computations.\n", + " for i in range(c_fp64_frag.layout.size):\n", + " # Get the elements offsets within the output tile\n", + " (global_x, global_y) = accumulator.map_fragment_index(i)\n", + " # Exercise --> Use shared memory to get the shifts for this particular element\n", + " shift_a = -1\n", + " shift_b = -1\n", + "\n", + " # Convert int32_t slice result back to double precision\n", + " # with appropriate scaling for this diagonal and element\n", + " d_fp64_frag[i] = nth_slice_to_fp64(diag, gemm_results[i], shift_a + shift_b)\n", + "\n", + " # Apply alpha/beta scaling and accumulate into C\n", + " # Use beta only for the first diagonal we process, then just add (beta=1.0)\n", + " beta_used = beta\n", + " if True: # EXERCISE -> Figure out when to use 1.0 for beta\n", + " beta_used = 1.0\n", + " axpby(alpha, d_fp64_frag, beta_used, c_fp64_frag)\n", + "\n", + " accumulator.partition_and_copy(c_fp64_frag, gmem_c)\n", + "\n", + " tile_pipeline._del()\n", + "\n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f7cee2d-006f-452b-a9d8-b83930f9160e", + "metadata": {}, + "outputs": [], + "source": [ + "def fused_dgemm_ozaki(tensor_slicedA_cupy, tensor_slicedB_cupy, tensor_c_cupy, tensor_shift_a_cupy, tensor_shift_b_cupy, alpha, beta, context, warmup=True):\n", + " m, n = tensor_c_cupy.shape\n", + " _, k, slices = tensor_slicedA_cupy.shape\n", + "\n", + " BLAS = context[\"BLAS\"]\n", + " PIPELINE_DEPTH = context[\"PIPELINE_DEPTH\"]\n", + " gemm_kernel = context[\"gemm_kernel\"]\n", + " grid = context[\"gemm_grid\"]\n", + " block = context[\"gemm_block\"]\n", + 
"\n", + " TILE_M, TILE_N = BLAS.c_dim\n", + "\n", + " tensor_slicedA = cuda.as_cuda_array(tensor_slicedA_cupy)\n", + " tensor_slicedB = cuda.as_cuda_array(tensor_slicedB_cupy)\n", + " tensor_shift_a = cuda.as_cuda_array(tensor_shift_a_cupy)\n", + " tensor_shift_b = cuda.as_cuda_array(tensor_shift_b_cupy)\n", + " tensor_c = cuda.as_cuda_array(tensor_c_cupy)\n", + "\n", + " device_pipeline = BLAS.suggest_device_pipeline(PIPELINE_DEPTH, tensor_slicedA, tensor_slicedB)\n", + "\n", + " smem_size = device_pipeline.buffer_size + (TILE_M + TILE_N) * np.dtype(np.int32).itemsize\n", + " if warmup:\n", + " set_max_dynamic_shared_size_bytes(gemm_kernel, smem_size,\n", + " slices, tensor_shift_a, tensor_shift_b, alpha, beta, tensor_c, device_pipeline)\n", + " gemm_kernel[grid, block, 0, smem_size](slices, tensor_shift_a, tensor_shift_b, alpha, beta, tensor_c, device_pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4ec6e27-8186-4e47-8ce7-de9f3da75306", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func(m, n, k):\n", + " tile_m = 128\n", + " tile_n = 128\n", + " tile_k = 128\n", + " \n", + " pipeline_depth = 3\n", + " block_size = 128\n", + "\n", + " assert m % tile_m == 0, \"Unsupported dimension m for TILE_M\"\n", + " assert n % tile_n == 0, \"Unsupported dimension n for TILE_N\"\n", + " assert k % tile_k == 0, \"Unsupported dimension k for TILE_K\"\n", + " assert k >= (tile_k * pipeline_depth), \"Unsupported pipeline depth for k\"\n", + " \n", + " BLAS = Matmul(size=(tile_m, tile_n, tile_k),\n", + " precision=(np.int8, np.int8, np.int32),\n", + " data_type=\"real\",\n", + " alignment=MAX_ALIGNMENT,\n", + " arrangement=(\"row_major\", \"col_major\", \"col_major\"), # Do not change\n", + " execution=\"Block\",\n", + " block_size=block_size,\n", + " with_pipeline=True,\n", + " enable_input_streaming=True,\n", + " static_block_dim=True)\n", + "\n", + " gemm_grid = (m // tile_m, n // tile_n)\n", + " gemm_block = BLAS.block_dim\n", 
+ "\n", + " return {\n", + " \"BLAS\": BLAS,\n", + " \"PIPELINE_DEPTH\": pipeline_depth,\n", + " \"gemm_kernel\" : get_emulated_dgemm_kernel(BLAS),\n", + " \"gemm_grid\": gemm_grid,\n", + " \"gemm_block\": gemm_block,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7a1d6ef-b04c-455b-a2e0-fbf80925e2de", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_fused_emulated_dgemm(problems, setup_func, fused_dgemm_ozaki)" + ] + }, + { + "cell_type": "markdown", + "id": "f970edfd-cd9c-4600-b865-feb23b59471f", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "925d58a0-5507-4a8c-a95b-c1d316fa24eb", + "metadata": {}, + "outputs": [], + "source": [ + "def get_emulated_dgemm_kernel_solution(BLAS):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " tile_m, tile_n = BLAS.c_dim\n", + "\n", + " uint8_width = get_width(np.uint8)\n", + " \n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLAS.block_size, 1))\n", + " def dgemm_kernel(slices, shift_a_tensor, shift_b_tensor, alpha, beta, tensor_c, device_pipeline: DevicePipeline):\n", + " m, n = tensor_c.shape\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem = cuda.shared.array(shape=(0,), dtype=np.int8, alignment=device_pipeline.buffer_alignment)\n", + " smem_pipeline, smem = smem[:device_pipeline.buffer_size], smem[device_pipeline.buffer_size:].view(np.int32)\n", + " smem_shift_a, smem = smem[:tile_m], smem[tile_m:]\n", + " smem_shift_b, smem = smem[:tile_n], smem[tile_n:]\n", + "\n", + " 
block_start_m = block_m * tile_m\n", + " block_end_m = (block_m + 1) * tile_m\n", + "\n", + " block_start_n = block_n * tile_n\n", + " block_end_n = (block_n + 1) * tile_n\n", + "\n", + " if block_start_m >= m or block_start_n >= n:\n", + " return\n", + "\n", + " shift_a_view = shift_a_tensor[block_start_m : block_end_m]\n", + " shift_b_view = shift_b_tensor[block_start_n : block_end_n]\n", + "\n", + " tid = cuda.threadIdx.x\n", + " if tid < tile_m:\n", + " smem_shift_a[tid] = shift_a_view[tid]\n", + " if tid < tile_n:\n", + " smem_shift_b[tid] = shift_b_view[tid]\n", + " cuda.syncthreads()\n", + "\n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " ldc = max(c_view.strides) // c_view.itemsize\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + " \n", + " initial_diag = slices - 1\n", + " initial_term = 0\n", + "\n", + " tile_pipeline = device_pipeline.get_tile(smem_pipeline,\n", + " (block_m, np.int32(initial_term)),\n", + " (block_n, np.int32(initial_diag)))\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + " for diag in range(initial_diag, -1, -1):\n", + " \n", + " accumulator.clear()\n", + " for term in range(initial_term, diag + 1):\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " next_slice_row = 0 if term == diag else term + 1\n", + " next_slice_col = (diag - 1) if term == diag else diag - next_slice_row\n", + "\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " (block_m, np.int32(next_slice_row)),\n", + " (block_n, np.int32(next_slice_col)))\n", + "\n", + " if accumulator.is_thread_active():\n", + " gemm_results = accumulator.get_results()\n", + " \n", + " acc_fp64_frag = make_fragment_like(gemm_results, np.float64)\n", + " c_fp64_frag = accumulator.make_partition_and_copy(gmem_c)\n", + " \n", + " # At this point of the compuation, we can no longer longer do tile based operations. 
When we convert back to\n", + " # FP64 we need to know the shifts associated with the row of A and column of B that produced this value. The\n", + " # nvmath-python cublasDx bindings give us the ability to figure out the relative index within the tile. We can\n", + " # use this to find our shifts, do some intermediate computations, and then proceed with more tile computations.\n", + " for i in range(c_fp64_frag.layout.size):\n", + " (global_x, global_y) = accumulator.map_fragment_index(i)\n", + " shift_a = smem_shift_a[global_x]\n", + " shift_b = smem_shift_b[global_y]\n", + " \n", + " acc_fp64_frag[i] = nth_slice_to_fp64(diag, gemm_results[i], shift_a + shift_b)\n", + " \n", + " beta_used = beta if diag == slices - 1 else 1.0\n", + " axpby(alpha, acc_fp64_frag, beta_used, c_fp64_frag)\n", + " accumulator.partition_and_copy(c_fp64_frag, gmem_c)\n", + "\n", + " tile_pipeline._del()\n", + "\n", + " return dgemm_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a6f15d9-e6fd-43ca-8660-216d51f1bf69", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func_solution(m, n, k):\n", + " ctx = setup_func(m, n, k)\n", + " BLAS = ctx[\"BLAS\"]\n", + " ctx[\"gemm_kernel\"] = get_emulated_dgemm_kernel_solution(BLAS)\n", + " return ctx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a16387c-8906-4c5a-ad98-10713ce0d215", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_fused_emulated_dgemm(problems, setup_func_solution, fused_dgemm_ozaki)" + ] + }, + { + "cell_type": "markdown", + "id": "446bb3ae-e948-4833-b8b0-eb07835f6ba3", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this notebook, we finished fusing the epilogue kernel into our emulated gemm kernel. The core technique needed was an API to get our relative coordinates within the tile and use that to make element specific updates. From there, we kept utilizing tile-based API's for efficiency and simplicity." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/CMakeLists.txt b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/CMakeLists.txt new file mode 100644 index 00000000..a399e06e --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 4.0) + +LIST(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin") +project(cublasdx-dgemm-notebook-1 VERSION 0.1 LANGUAGES CUDA CXX) + +# Add header tutorial helper files +add_library(tutorial_helpers INTERFACE) +target_include_directories(tutorial_helpers INTERFACE ../../cpp_source/include/) + +include(../../cmake/tutorial.cmake) + +add_tutorial(2a_unfused_emulation cpp/2a_unfused_emulation/dgemm_emulation.cu) +add_tutorial(2b_partially_fused_emulation cpp/2b_partially_fused_emulation/dgemm_emulation.cu) +add_tutorial(2c_fully_fused_emulation cpp/2c_fully_fused_emulation/dgemm_emulation.cu) diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Flowchart.png b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Flowchart.png new file mode 100644 index 00000000..f42422d3 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Flowchart.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190bd752f97edced5238606b45b68b2cdeabc9a3a8b12942590b2d6161146ffe +size 53833 diff --git 
a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications-Fused.png b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications-Fused.png new file mode 100644 index 00000000..eebf4e35 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications-Fused.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40a5cfd4ee02c6d8e6a9a9d253bd74977a5fef4220ef8b6cae5f700cc566d665 +size 346322 diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications-Partial-Fusion.png b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications-Partial-Fusion.png new file mode 100644 index 00000000..af1eb8ca --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications-Partial-Fusion.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de02d45b67fc2567f023bb5ddc1f94ebf633e8df27d9e90031dae903d07f7ab5 +size 361844 diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications.png b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications.png new file mode 100644 index 00000000..637689b5 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/Images/Ozaki-I-Multiplications.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21bc0ab23e5ce4ee2c4009e0fb68dd46bb2863c91808f6b86e57952a8722929d +size 419487 diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/.gitignore b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/.gitignore new file mode 100644 index 00000000..163d4aec --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/.gitignore @@ -0,0 +1 @@ 
+**/*.hpp.inc diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/dgemm_emulation.cu b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/dgemm_emulation.cu new file mode 100644 index 00000000..02cf07f7 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/dgemm_emulation.cu @@ -0,0 +1,362 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +// cuda libraries +#include + +// utility headers +#include +#include +#include + +#include "slicing.hpp" +#include "emulation_kernels.hpp" + +// This example demonstrates the Ozaki scheme for emulating double precision GEMM +// using multiple lower precision GEMM operations. The Ozaki scheme works by: +// 1. Decomposing double precision matrices into multiple int8_t "slices" +// 2. Performing GEMM on each combination of slices +// 3. Reconstructing the final double precision result +// +// Mathematical foundation: +// For double precision values a and b, we can represent them as: +// a = Σ(i=0 to slices-1) a_i * 2^(shift_a - i*bits_per_slice) +// b = Σ(j=0 to slices-1) b_j * 2^(shift_b - j*bits_per_slice) +// +// Then a*b = ΣΣ a_i * b_j * 2^(shift_a + shift_b - (i+j)*bits_per_slice) +// +// This allows us to compute the product using multiple int8_t GEMM operations +// and then combine the results with appropriate scaling. + +// Main cuBLASDx DGEMM emulation function using Ozaki scheme +// This function orchestrates the entire emulation process: +// 1. Preprocessing: Extract scaling factors from input matrices +// 2. Slicing: Decompose double precision matrices into int8_t slices +// 3. Matrix multiplication: Perform GEMM on slice combinations +// 4. 
Reconstruction: Combine results back to double precision +template +auto run_tutorial_kernel(double alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + double beta, + CTensor const& tensor_c, + cudaStream_t stream = 0, + unsigned warm_up_runs = 10, + unsigned kernel_runs = 100, + bool debug = false) { + + float total_time = 0.f; + + /* ============================================================== */ + /* OZAKI SCHEME STEP 1: SETUP */ + /* Prepare slice tensors */ + /* ============================================================== */ + // Verify that tile dimensions divide evenly into problem dimensions + + // Each slice represents a portion of the original double precision values + + using slice_value_type = typename BLAS::a_value_type; + using accumulator_value_type = typename BLAS::c_value_type; + + // Number of slices per elements + auto const static_slices = cuda::std::integral_constant {}; + // Total number of slice matrix multiplications + // (number of elements in diag-inclusive lower triangle of matrix with both dimensions == Slices) + auto const static_slice_products = cuda::std::integral_constant {}; + + // Create slice tensor A: [m, k, slices] - stores int8_t slices of matrix A + auto const [shape_a_rows_, shape_a_cols_] = tensor_a.layout().shape(); + int const shape_a_rows = shape_a_rows_; + int const shape_a_cols = shape_a_cols_; + constexpr auto arr_a = cublasdx::arrangement_of_v_a; + auto d_slice_a_storage = + tutorial::get_empty_device_tensor(shape_a_rows, shape_a_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_a = cuda::std::get<1>(d_slice_a_storage); + + + // Create slice tensor B: [k, n, slices] - stores int8_t slices of matrix B + auto const [shape_b_rows_, shape_b_cols_] = tensor_b.layout().shape(); + int const shape_b_rows = shape_b_rows_; + int const shape_b_cols = shape_b_cols_; + constexpr auto arr_b = cublasdx::arrangement_of_v_b; + auto d_slice_b_storage = + 
tutorial::get_empty_device_tensor(shape_b_rows, shape_b_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_b = cuda::std::get<1>(d_slice_b_storage); + + + // Create slice tensor C: [m, n, slice_products] - stores int32_t slices of matrix C + auto const [shape_c_rows_, shape_c_cols_] = tensor_c.layout().shape(); + int const shape_c_rows = shape_c_rows_; + int const shape_c_cols = shape_c_cols_; + constexpr auto arr_c = cublasdx::arrangement_of_v_c; + auto d_products_storage = + tutorial::get_empty_device_tensor(shape_c_rows, shape_c_cols, static_slice_products); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_products = cuda::std::get<1>(d_products_storage); + + + /* ============================================================== */ + /* OZAKI SCHEME STEP 2: PREPROCESSING */ + /* Extract max exponent of rows(A) and cols(B) */ + /* ============================================================== */ + + // The Ozaki scheme requires finding the maximum absolute value in each + // row of A and each column of B to determine appropriate scaling factors. + // These scaling factors ensure that when we slice the double precision + // values into int8_t components, we don't lose significant precision. 
+ + using shift_t = int32_t; + constexpr auto shift_arr = cublasdx::col_major; + + // Create tensors for the shift values with proper tiling structure + auto const static_tile_m = cuda::std::integral_constant> {}; + auto d_shift_a_storage = + tutorial::get_empty_device_tensor(static_tile_m, shape_a_rows / static_tile_m()); + auto tensor_shift_a = cuda::std::get<1>(d_shift_a_storage); + + auto const static_tile_n = cuda::std::integral_constant> {}; + auto d_shift_b_storage = + tutorial::get_empty_device_tensor(static_tile_n, shape_b_cols / static_tile_n()); + auto tensor_shift_b = cuda::std::get<1>(d_shift_b_storage); + + // Execute preprocessing kernels to find maximum values and compute scaling factors + { + auto run_preprocessing = [&](auto str) { + constexpr int reduction_block_size = 64; + // Find max absolute value in each row of A and convert to exponent shift + max_reduce_kernel + <<>>(tensor_a, tensor_shift_a); + // Find max absolute value in each column of B and convert to exponent shift + max_reduce_kernel + <<>>(tensor_b, tensor_shift_b); + }; + + auto time_ms = tutorial::measure::execution(run_preprocessing, warm_up_runs, kernel_runs, stream); + + total_time += time_ms; + if (debug) { + std::cout << "----> Custom Preprocess time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 3: SLICING */ + /* Slice up input A and B matrices */ + /* ============================================================== */ + + // This step decomposes each double precision value into multiple int8_t slices. + // For a double precision value x with scaling factor s, we create slices such that: + // x ≈ Σ(i=0 to slices-1) slice_i * 2^(s - i*8) + // where each slice_i is an int8_t value. 
+ + { + + auto run_slicing = [&](auto str) { + constexpr auto slice_kernel_block_size = 64; + // Slice matrix A: each double precision element becomes slices int8_t values + slice_kernel + <<>>( + tensor_a, tensor_shift_a, tensor_slice_a, shape_a_cols); + // Slice matrix B: each double precision element becomes slices int8_t values + slice_kernel + <<>>( + tensor_b, tensor_shift_b, tensor_slice_b, shape_a_cols); + }; + + auto time_ms = tutorial::measure::execution(run_slicing, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Slice time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 4: MATRIX MULTIPLICATION */ + /* Product of slices */ + /* ============================================================== */ + + // This is the core of the Ozaki scheme. We need to compute the product: + // C = A * B = (Σ A_i * 2^shift_A_i) * (Σ B_j * 2^shift_B_j) + // = ΣΣ A_i * B_j * 2^(shift_A_i + shift_B_j) + // + // We compute this as multiple GEMM operations between slice combinations, + // with each result scaled appropriately and accumulated into the final result. 
+ + { + auto run_unfused_matmul = [&](auto str) { + #include "slice_coordination.hpp.inc" + }; + + auto time_ms = tutorial::measure::execution(run_unfused_matmul, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Matmul time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 5: EPILOGUE */ + /* Accumulate Diagonals */ + /* ============================================================== */ + + { + #include "epilogue_config.hpp.inc" + + dim3 grid(shape_a_rows / epilogue_kernel_tile_m, shape_b_cols / epilogue_kernel_tile_n); + dim3 block(epilogue_kernel_tile_m, epilogue_kernel_tile_n); + + auto dummy_c_storage = tutorial::get_copy_tensor(tensor_c); + auto dummy_tensor_c = cuda::std::get<1>(dummy_c_storage); + + auto run_epilogue = [&](auto str) { + epilogue_kernel + <<>>(alpha, beta, tensor_products, tensor_shift_a, tensor_shift_b, dummy_tensor_c); + }; + + auto time_ms = tutorial::measure::execution(run_epilogue, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Epilogue time: " << time_ms << " ms" << std::endl; + } + + // Run correctness check + epilogue_kernel + <<>>(alpha, beta, tensor_products, tensor_shift_a, tensor_shift_b, tensor_c); + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + std::vector results(tensor_c.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + tensor_c.size() * sizeof(double), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_tflops = tutorial::real_gemm_tflops(shape_a_rows, shape_b_cols, shape_a_cols) / total_time; + return cuda::std::make_tuple(total_time, avg_tflops, results); +} + +int main(int argc, char** argv) 
{ + using alpha_value_type = double; + using beta_value_type = double; + + constexpr auto arrangement_a = cublasdx::row_major; + constexpr auto arrangement_b = cublasdx::col_major; + constexpr auto arrangement_c = cublasdx::col_major; + + int const warm_up_runs = 10; + int const kernel_runs = 100; + + #include "parameters.hpp.inc" + + bool const debug = false; + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << " (slices=" << slices << ")\n"; + + // =================================== + // Ozaki scheme configuration + // =================================== + + #include "cublasdx_config.hpp.inc" + + if (m % tile_m != 0 or n % tile_n != 0 or k % tile_k != 0) { + std::cerr << "Problem shape must be divisible by tile shape" << std::endl; + exit(-1); + } + + // =================================== + // Data type definitions + // =================================== + + using a_value_type = double; + using b_value_type = double; + using c_value_type = double; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + if (debug) { + tutorial::print_device_properties(); + } + + /* ============================================================== */ + /* Input FP64 (host) tensors */ + /* ============================================================== */ + static const float range_lower_bound = 1.0f / 3.14f; + static const float range_upper_bound = 52.0f / 3.14f; + int seed = 1234; + constexpr tutorial::random_distribution dist = tutorial::random_distribution::uniform; + + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor( + m, k, range_lower_bound, range_upper_bound, seed); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor( + k, n, range_lower_bound, range_upper_bound, seed + 1); + + auto 
[vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor( + m, n, range_lower_bound, range_upper_bound, seed + 2); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + /* ============================================================== */ + /* Compute Reference Result */ + /* ============================================================== */ + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + + /* ============================================================== */ + /* Compute Emulation Result */ + /* ============================================================== */ + auto [time_tutorial, tflops_tutorial, results_tutorial] = run_tutorial_kernel( + alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs, debug); + + /* ========================================================================================= */ + /* Print summary of performance and correctness results */ + /* ========================================================================================= */ + std::cout << "\nCustom Emulation Kernel (unfused)\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% 
reference performance \n\n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/emulation_kernels.hpp b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/emulation_kernels.hpp new file mode 100644 index 00000000..5cce41ca --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/emulation_kernels.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include + +#include +using namespace cublasdx; + +#include + +#include +#include "slicing.hpp" + +enum class slice_matrix +{ + a, + b +}; + +template +__launch_bounds__(BlockSize, 2) __global__ void max_reduce_kernel(InTensor in_tensor, OutTensor out_tensor) { + using datatype = tutorial::tensor_value_type_t; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const auto [tile_size_x, tile_size_y] = in_tensor.layout().shape(); + auto tid = threadIdx.x; + auto bid = blockIdx.x; + + // Assume that tensor is reduced along the last dimension + auto const row_index = tutorial::conditional_return(bid, cublasdx::slice); + auto const col_index = tutorial::conditional_return(cublasdx::slice, bid); + + auto global_tile = in_tensor(row_index, col_index); + + // 1. Find local maximum absolute value for this thread + double local_max = 0; + + auto const length = (SliceMatrix == slice_matrix::a) ? tile_size_y : tile_size_x; + for (auto i = tid; i < length; i += BlockSize) { + local_max = cuda::std::max(local_max, cuda::std::abs(global_tile(i))); + } + + // 2. Compute block-wide reduction to find maximum across all threads + __syncthreads(); + const double block_max = BlockReduce(temp_storage).Reduce(local_max, [](const auto& a, const auto& b) { + return cuda::std::max(a, b); + }); + + // 3. 
Convert maximum value to exponent shift and store to global memory + // This shift determines the scaling factor for slicing this row/column + if (tid == 0) { + out_tensor(bid) = max_to_exponent_shift(block_max); + } +} + + +template +__launch_bounds__(BlockSize, 2) __global__ + void slice_kernel(InTensor in_tensor, ShiftTensor shift_tensor, OutTensor out_tensor, int32_t reduction_dim_size) { + using in_datatype = tutorial::tensor_value_type_t; + using out_datatype = tutorial::tensor_value_type_t; + + const auto tid = threadIdx.x + blockIdx.x * BlockSize; + + // Calculate which matrix element this thread processes + auto slow_idx = tid / reduction_dim_size; + auto fast_idx = tid % reduction_dim_size; + + auto const row_idx = (SliceMatrix == slice_matrix::a) ? slow_idx : fast_idx; + auto const col_idx = (SliceMatrix == slice_matrix::a) ? fast_idx : slow_idx; + + // Decompose the double precision value into multiple int8_t slices + // using the appropriate scaling factor for this row/column + const cuda::std::array slices = + slices_from_fp64(in_tensor(row_idx, col_idx), shift_tensor(slow_idx)); + +// Store all slices for this matrix element +#pragma unroll + for (int elem = 0; elem < Slices; ++elem) { + out_tensor(row_idx, col_idx, elem) = slices[elem]; + } +} + +#include "igemm_kernel.hpp.inc" + +#include "epilogue_kernel.hpp.inc" + diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/slicing.hpp b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/slicing.hpp new file mode 100644 index 00000000..c39cbfcd --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2a_unfused_emulation/slicing.hpp @@ -0,0 +1,221 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#define SLICING_FUNCTION __host__ __device__ __forceinline__ + +union double_structure { + double d; + struct float64 { + unsigned int mantissa_lo : 32; + unsigned int mantissa_hi : 20; + unsigned int exponent : 11; + unsigned int sign : 1; + } s; +}; + +static constexpr int bias = 1023; + +/* + * Signed magnitudes of length N only allow for (N-1) of effective storage. + */ +template +SLICING_FUNCTION constexpr int get_width() { + if constexpr (cuda::std::is_signed()) { + return 8 * sizeof(T) - 1; + } else { + return 8 * sizeof(T); + }; +} + +SLICING_FUNCTION int64_t div_up(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +SLICING_FUNCTION int rz_width(const double_structure& em) { + return em.s.exponent + 1 - bias; +} +// Length of a bits before the decimal point. i.e., bit width if casted to infinite-length int type. +SLICING_FUNCTION int rz_width(const double d) { + double_structure em {d}; + return rz_width(em); +} + +SLICING_FUNCTION constexpr int64_t ipow_p(int64_t base, int exp, int64_t ans = 1) { + return exp < 1 ? ans : ipow_p(base * base, exp / 2, (exp % 2) ? ans * base : ans); +} + +SLICING_FUNCTION constexpr double ipow(int base, int exp) { + return exp > 0 ? 
ipow_p(base, exp) : 1.0 / ipow_p(base, -exp); +} + +template +SLICING_FUNCTION constexpr int max_exponent(); + +// scale to numbers no bigger than 256 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 8; +} + +// scale to numbers no bigger than 128 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 7; +} + +SLICING_FUNCTION int32_t get_exponent(double val) { + double_structure em = {val}; + + int em_exponent = (em.s.exponent + 1 - bias); + + if (em.s.mantissa_hi & (63 << 14) == (63 << 14)) + em_exponent++; + + return em_exponent; +} + +// An implementation of ldexp() to be used in scaling double-precision numbers obtained from unpacking slices +// in the epilogue. The resulting double values must be finite and normalized, and so the fast path should +// simply adjust the exponent field of the value, so long as the result is also finite and normalized. +SLICING_FUNCTION void epilogue_ldexp(double_structure& em, int exp) { + static constexpr int exp_max = bias - 1; + int previous_exp_biased = static_cast(em.s.exponent); + if (0 < previous_exp_biased && 0 < previous_exp_biased + exp && previous_exp_biased + exp <= exp_max + bias) { + em.s.exponent += exp; + return; + } + em.d = ldexp(em.d, exp); +} + +/* + * Returns the exponent shift to be applied to a row/colum + * based on the max(abs()) on that row/column. + * Naively, this scaling factor would be just the exponent + * of max(abs()) but we do some other tricks to account for + * the encoding of the signed magnitude only on the leading + * slice among other things.. + */ +SLICING_FUNCTION int32_t max_to_exponent_shift(double row_col_max) { + static constexpr int scale_max_exponent = max_exponent(); + + return scale_max_exponent - get_exponent(row_col_max); +} + +/* + * slices up the input value "val" in "nslices" of type "SliceValueType". 
+ * Before slicing the number, the exponent of "val" is shifted based on + * the value of "exponent_shift" which has been computed using the + * max_to_exponent_shift function based on the max(abs()) of the relevant + * row/column of A/B. + * + * On exit, the first value of the returned array contains the most + * significant slice. + */ +template +SLICING_FUNCTION cuda::std::array slices_from_fp64(double val, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + + cuda::std::array slices = {0}; + + static constexpr double normalization_factor = 0x1.0p52; + // Normalise denormalised numbers, but store the effective exponent in its own variable, + // allowing for representation of fp64 denorms as normalised numbers. + + int skip_slices = 0; + int64_t r = 0; + + uint8_t reg_pack = 0; + + double_structure r0 = {val}; + int denorm_compensation = 0; + if (r0.s.exponent == 0) { + if (r0.d == 0.0) { + skip_slices = nslices; + r = 0; + } else { + /* round to nearest is the default behavior on CPU */ + r0.d = (r0.d * normalization_factor); + denorm_compensation = -52; + } + } + int exp = r0.s.exponent + exponent_shift + denorm_compensation - bias; + exp += (nslices - 1) * get_width(); // Use all 8 bits. + + // Adjust casting range. + int extra_width = (exp + 1) - 63; + extra_width = extra_width > 0 ? 
extra_width : 0; + skip_slices = div_up(extra_width, get_width()); + exp -= skip_slices * get_width(); + + // Handle exp outside of double range + if (exp < 0) { + r = 0; + } else { + r0.s.exponent = (unsigned int)(exp + bias); + r = static_cast(r0.d); + } + + for (int64_t _i = 0; _i < nslices; _i++) { + int64_t i = nslices - 1 - _i; + + if (_i < skip_slices) { + reg_pack = 0; + } else { + reg_pack = static_cast(r); + slices[i] = static_cast(reg_pack); + r = (r >> get_width()) + (reg_pack >> get_width()); + } + } + + return slices; +} + +/* + * This function is a building block to reconstruct an FP64 number from the slices. + * Instead of receiving the set of slices and adding them to an FP64 number, + * this function gets a single slice (the NTH-slice) and returns it as an FP64 value. + * + * In this way, one could use this function to compute and accumulate the contributions + * from the slices separately. + * + * Remark: when reconstructing an FP64 number accumulate the least significant + * diagonals first to avoid catastrophic cancellation. 
+ */ +template +SLICING_FUNCTION double nth_slice_to_fp64(int32_t nth, DiagonalAccType nth_slice, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + assert(nth >= 0); + + /* In some instances, we use the unsigned value type to leverage all bits for storage */ + double ko = pow(2.0, -get_width>() * nth); + + double value_i = ko * static_cast(nth_slice); + double_structure value = {value_i}; + epilogue_ldexp(value, -exponent_shift); + return value.d; +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/dgemm_emulation.cu b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/dgemm_emulation.cu new file mode 100644 index 00000000..040608de --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/dgemm_emulation.cu @@ -0,0 +1,367 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +// cuda libraries +#include + +// utility headers +#include +#include +#include + +#include "slicing.hpp" +#include "emulation_kernels.hpp" + +// This example demonstrates the Ozaki scheme for emulating double precision GEMM +// using multiple lower precision GEMM operations. The Ozaki scheme works by: +// 1. Decomposing double precision matrices into multiple int8_t "slices" +// 2. Performing GEMM on each combination of slices +// 3. 
Reconstructing the final double precision result +// +// Mathematical foundation: +// For double precision values a and b, we can represent them as: +// a = Σ(i=0 to slices-1) a_i * 2^(shift_a - i*bits_per_slice) +// b = Σ(j=0 to slices-1) b_j * 2^(shift_b - j*bits_per_slice) +// +// Then a*b = ΣΣ a_i * b_j * 2^(shift_a + shift_b - (i+j)*bits_per_slice) +// +// This allows us to compute the product using multiple int8_t GEMM operations +// and then combine the results with appropriate scaling. + +// Main cuBLASDx DGEMM emulation function using Ozaki scheme +// This function orchestrates the entire emulation process: +// 1. Preprocessing: Extract scaling factors from input matrices +// 2. Slicing: Decompose double precision matrices into int8_t slices +// 3. Matrix multiplication: Perform GEMM on slice combinations +// 4. Reconstruction: Combine results back to double precision +template +auto run_tutorial_kernel(double alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + double beta, + CTensor const& tensor_c, + cudaStream_t stream = 0, + unsigned warm_up_runs = 10, + unsigned kernel_runs = 100, + bool debug = false) { + + float total_time = 0.f; + + /* ============================================================== */ + /* OZAKI SCHEME STEP 1: SETUP */ + /* Prepare slice tensors */ + /* ============================================================== */ + // Verify that tile dimensions divide evenly into problem dimensions + + // Each slice represents a portion of the original double precision values + + using slice_value_type = typename BLAS::a_value_type; + using accumulator_value_type = typename BLAS::c_value_type; + + // Number of slices per elements + auto const static_slices = cuda::std::integral_constant {}; + + // Create slice tensor A: [m, k, slices] - stores int8_t slices of matrix A + auto const [shape_a_rows_, shape_a_cols_] = tensor_a.layout().shape(); + int const shape_a_rows = shape_a_rows_; + int const shape_a_cols = shape_a_cols_; + 
constexpr auto arr_a = cublasdx::arrangement_of_v_a; + auto d_slice_a_storage = + tutorial::get_empty_device_tensor(shape_a_rows, shape_a_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_a = cuda::std::get<1>(d_slice_a_storage); + + + // Create slice tensor B: [k, n, slices] - stores int8_t slices of matrix B + auto const [shape_b_rows_, shape_b_cols_] = tensor_b.layout().shape(); + int const shape_b_rows = shape_b_rows_; + int const shape_b_cols = shape_b_cols_; + constexpr auto arr_b = cublasdx::arrangement_of_v_b; + auto d_slice_b_storage = + tutorial::get_empty_device_tensor(shape_b_rows, shape_b_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_b = cuda::std::get<1>(d_slice_b_storage); + + + // Create slice tensor C: [m, n, slice_products] - stores int32_t slices of matrix C + auto const [shape_c_rows_, shape_c_cols_] = tensor_c.layout().shape(); + int const shape_c_rows = shape_c_rows_; + int const shape_c_cols = shape_c_cols_; + constexpr auto arr_c = cublasdx::arrangement_of_v_c; + auto d_products_storage = + tutorial::get_empty_device_tensor(shape_c_rows, shape_c_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_products = cuda::std::get<1>(d_products_storage); + + + /* ============================================================== */ + /* OZAKI SCHEME STEP 2: PREPROCESSING */ + /* Extract max exponent of rows(A) and cols(B) */ + /* ============================================================== */ + + // The Ozaki scheme requires finding the maximum absolute value in each + // row of A and each column of B to determine appropriate scaling factors. + // These scaling factors ensure that when we slice the double precision + // values into int8_t components, we don't lose significant precision. 
+ + using shift_t = int32_t; + constexpr auto shift_arr = cublasdx::col_major; + + // Create tensors for the shift values with proper tiling structure + auto const static_tile_m = cuda::std::integral_constant> {}; + auto d_shift_a_storage = + tutorial::get_empty_device_tensor(static_tile_m, shape_a_rows / static_tile_m()); + auto tensor_shift_a = cuda::std::get<1>(d_shift_a_storage); + + auto const static_tile_n = cuda::std::integral_constant> {}; + auto d_shift_b_storage = + tutorial::get_empty_device_tensor(static_tile_n, shape_b_cols / static_tile_n()); + auto tensor_shift_b = cuda::std::get<1>(d_shift_b_storage); + + // Execute preprocessing kernels to find maximum values and compute scaling factors + { + auto run_preprocessing = [&](auto str) { + constexpr int reduction_block_size = 64; + // Find max absolute value in each row of A and convert to exponent shift + max_reduce_kernel + <<>>(tensor_a, tensor_shift_a); + // Find max absolute value in each column of B and convert to exponent shift + max_reduce_kernel + <<>>(tensor_b, tensor_shift_b); + }; + + auto time_ms = tutorial::measure::execution(run_preprocessing, warm_up_runs, kernel_runs, stream); + + total_time += time_ms; + if (debug) { + std::cout << "----> Custom Preprocess time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 3: SLICING */ + /* Slice up input A and B matrices */ + /* ============================================================== */ + + // This step decomposes each double precision value into multiple int8_t slices. + // For a double precision value x with scaling factor s, we create slices such that: + // x ≈ Σ(i=0 to slices-1) slice_i * 2^(s - i*8) + // where each slice_i is an int8_t value. 
+ + { + + auto run_slicing = [&](auto str) { + constexpr auto slice_kernel_block_size = 64; + // Slice matrix A: each double precision element becomes slices int8_t values + slice_kernel + <<>>( + tensor_a, tensor_shift_a, tensor_slice_a, shape_a_cols); + // Slice matrix B: each double precision element becomes slices int8_t values + slice_kernel + <<>>( + tensor_b, tensor_shift_b, tensor_slice_b, shape_a_cols); + }; + + auto time_ms = tutorial::measure::execution(run_slicing, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Slice time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 4: MATRIX MULTIPLICATION */ + /* Product of slices */ + /* ============================================================== */ + + // This is the core of the Ozaki scheme. We need to compute the product: + // C = A * B = (Σ A_i * 2^shift_A_i) * (Σ B_j * 2^shift_B_j) + // = ΣΣ A_i * B_j * 2^(shift_A_i + shift_B_j) + // + // We compute this as multiple GEMM operations between slice combinations, + // with each result scaled appropriately and accumulated into the final result. 
+ + { + #include "pipeline_config.hpp.inc" + + dim3 grid(shape_a_rows / static_tile_m(), shape_b_cols / static_tile_n()); + auto kernel = fused_epilogue_kernel; + auto shared_memory_size = device_pipeline.buffer_size(); + CUDA_CHECK_AND_EXIT( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); + + auto run_fused_matmul = [&](auto str) { + kernel<<>>(device_pipeline, + tensor_products); + }; + + auto time_ms = tutorial::measure::execution(run_fused_matmul, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Matmul time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 5: EPILOGUE */ + /* Accumulate Diagonals */ + /* ============================================================== */ + + { + #include "epilogue_config.hpp.inc" + + dim3 grid(shape_a_rows / epilogue_kernel_tile_m, shape_b_cols / epilogue_kernel_tile_n); + dim3 block(epilogue_kernel_tile_m, epilogue_kernel_tile_n); + + auto dummy_c_storage = tutorial::get_copy_tensor(tensor_c); + auto dummy_tensor_c = cuda::std::get<1>(dummy_c_storage); + + auto run_epilogue = [&](auto str) { + epilogue_kernel + <<>>(alpha, beta, tensor_products, tensor_shift_a, tensor_shift_b, dummy_tensor_c); + }; + + auto time_ms = tutorial::measure::execution(run_epilogue, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Epilogue time: " << time_ms << " ms" << std::endl; + } + + epilogue_kernel + <<>>(alpha, beta, tensor_products, tensor_shift_a, tensor_shift_b, tensor_c); + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + std::vector results(tensor_c.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + 
tutorial::raw_pointer_cast(tensor_c.data()), + tensor_c.size() * sizeof(double), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_tflops = tutorial::real_gemm_tflops(shape_a_rows, shape_b_cols, shape_a_cols) / total_time; + return cuda::std::make_tuple(total_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + using alpha_value_type = double; + using beta_value_type = double; + + constexpr auto arrangement_a = cublasdx::row_major; + constexpr auto arrangement_b = cublasdx::col_major; + constexpr auto arrangement_c = cublasdx::col_major; + + int const warm_up_runs = 10; + int const kernel_runs = 100; + + #include "parameters.hpp.inc" + + bool const debug = false; + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << " (slices=" << slices << ")\n"; + + // =================================== + // Ozaki scheme configuration + // =================================== + + #include "cublasdx_config.hpp.inc" + + if (m % tile_m != 0 or n % tile_n != 0 or k % tile_k != 0) { + std::cerr << "Problem shape must be divisible by tile shape" << std::endl; + exit(-1); + } + + // =================================== + // Data type definitions + // =================================== + + using a_value_type = double; + using b_value_type = double; + using c_value_type = double; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + if (debug) { + tutorial::print_device_properties(); + } + + /* ============================================================== */ + /* Input FP64 (host) tensors */ + /* ============================================================== */ + static const float range_lower_bound = 1.0f / 3.14f; + static const float range_upper_bound = 52.0f / 3.14f; + int seed = 1234; + constexpr 
tutorial::random_distribution dist = tutorial::random_distribution::uniform; + + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor( + m, k, range_lower_bound, range_upper_bound, seed); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor( + k, n, range_lower_bound, range_upper_bound, seed + 1); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor( + m, n, range_lower_bound, range_upper_bound, seed + 2); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + /* ============================================================== */ + /* Compute Reference Result */ + /* ============================================================== */ + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + + /* ============================================================== */ + /* Compute Emulation Result */ + /* ============================================================== */ + auto [time_tutorial, tflops_tutorial, results_tutorial] = run_tutorial_kernel( + alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs, debug); + + /* ========================================================================================= */ + /* Print summary of performance and correctness results */ + /* ========================================================================================= */ + std::cout << "\nCustom Emulation Kernel (partially fused)\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr 
bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% reference performance \n\n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/emulation_kernels.hpp b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/emulation_kernels.hpp new file mode 100644 index 00000000..d6d51223 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/emulation_kernels.hpp @@ -0,0 +1,88 @@ +#pragma once + +#include +#include +#include + +#include +using namespace cublasdx; + +#include + +#include +#include "slicing.hpp" + +enum class slice_matrix +{ + a, + b +}; + +template +__launch_bounds__(BlockSize, 2) __global__ void max_reduce_kernel(InTensor in_tensor, OutTensor out_tensor) { + using datatype = tutorial::tensor_value_type_t; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const auto [tile_size_x, tile_size_y] = in_tensor.layout().shape(); + auto tid = threadIdx.x; + auto bid = blockIdx.x; + + // Assume that tensor is reduced along the last dimension + auto const row_index = tutorial::conditional_return(bid, cublasdx::slice); + auto const col_index = tutorial::conditional_return(cublasdx::slice, bid); + + auto global_tile = in_tensor(row_index, col_index); + + // 1. Find local maximum absolute value for this thread + double local_max = 0; + + auto const length = (SliceMatrix == slice_matrix::a) ? tile_size_y : tile_size_x; + for (auto i = tid; i < length; i += BlockSize) { + local_max = cuda::std::max(local_max, cuda::std::abs(global_tile(i))); + } + + // 2. 
Compute block-wide reduction to find maximum across all threads + __syncthreads(); + const double block_max = BlockReduce(temp_storage).Reduce(local_max, [](const auto& a, const auto& b) { + return cuda::std::max(a, b); + }); + + // 3. Convert maximum value to exponent shift and store to global memory + // This shift determines the scaling factor for slicing this row/column + if (tid == 0) { + out_tensor(bid) = max_to_exponent_shift(block_max); + } +} + + +template +__launch_bounds__(BlockSize, 2) __global__ + void slice_kernel(InTensor in_tensor, ShiftTensor shift_tensor, OutTensor out_tensor, int32_t reduction_dim_size) { + using in_datatype = tutorial::tensor_value_type_t; + using out_datatype = tutorial::tensor_value_type_t; + + const auto tid = threadIdx.x + blockIdx.x * BlockSize; + + // Calculate which matrix element this thread processes + auto slow_idx = tid / reduction_dim_size; + auto fast_idx = tid % reduction_dim_size; + + auto const row_idx = (SliceMatrix == slice_matrix::a) ? slow_idx : fast_idx; + auto const col_idx = (SliceMatrix == slice_matrix::a) ? 
fast_idx : slow_idx; + + // Decompose the double precision value into multiple int8_t slices + // using the appropriate scaling factor for this row/column + const cuda::std::array slices = + slices_from_fp64(in_tensor(row_idx, col_idx), shift_tensor(slow_idx)); + +// Store all slices for this matrix element +#pragma unroll + for (int elem = 0; elem < Slices; ++elem) { + out_tensor(row_idx, col_idx, elem) = slices[elem]; + } +} + +#include "fused_kernel.hpp.inc" +#include "epilogue_kernel.hpp.inc" + diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/slicing.hpp b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/slicing.hpp new file mode 100644 index 00000000..c39cbfcd --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2b_partially_fused_emulation/slicing.hpp @@ -0,0 +1,221 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#define SLICING_FUNCTION __host__ __device__ __forceinline__ + +union double_structure { + double d; + struct float64 { + unsigned int mantissa_lo : 32; + unsigned int mantissa_hi : 20; + unsigned int exponent : 11; + unsigned int sign : 1; + } s; +}; + +static constexpr int bias = 1023; + +/* + * Signed magnitudes of length N only allow for (N-1) of effective storage. + */ +template +SLICING_FUNCTION constexpr int get_width() { + if constexpr (cuda::std::is_signed()) { + return 8 * sizeof(T) - 1; + } else { + return 8 * sizeof(T); + }; +} + +SLICING_FUNCTION int64_t div_up(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +SLICING_FUNCTION int rz_width(const double_structure& em) { + return em.s.exponent + 1 - bias; +} +// Length of a bits before the decimal point. i.e., bit width if casted to infinite-length int type. +SLICING_FUNCTION int rz_width(const double d) { + double_structure em {d}; + return rz_width(em); +} + +SLICING_FUNCTION constexpr int64_t ipow_p(int64_t base, int exp, int64_t ans = 1) { + return exp < 1 ? ans : ipow_p(base * base, exp / 2, (exp % 2) ? ans * base : ans); +} + +SLICING_FUNCTION constexpr double ipow(int base, int exp) { + return exp > 0 ? ipow_p(base, exp) : 1.0 / ipow_p(base, -exp); +} + +template +SLICING_FUNCTION constexpr int max_exponent(); + +// scale to numbers no bigger than 256 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 8; +} + +// scale to numbers no bigger than 128 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 7; +} + +SLICING_FUNCTION int32_t get_exponent(double val) { + double_structure em = {val}; + + int em_exponent = (em.s.exponent + 1 - bias); + + if (em.s.mantissa_hi & (63 << 14) == (63 << 14)) + em_exponent++; + + return em_exponent; +} + +// An implementation of ldexp() to be used in scaling double-precision numbers obtained from unpacking slices +// in the epilogue. 
The resulting double values must be finite and normalized, and so the fast path should +// simply adjust the exponent field of the value, so long as the result is also finite and normalized. +SLICING_FUNCTION void epilogue_ldexp(double_structure& em, int exp) { + static constexpr int exp_max = bias - 1; + int previous_exp_biased = static_cast(em.s.exponent); + if (0 < previous_exp_biased && 0 < previous_exp_biased + exp && previous_exp_biased + exp <= exp_max + bias) { + em.s.exponent += exp; + return; + } + em.d = ldexp(em.d, exp); +} + +/* + * Returns the exponent shift to be applied to a row/colum + * based on the max(abs()) on that row/column. + * Naively, this scaling factor would be just the exponent + * of max(abs()) but we do some other tricks to account for + * the encoding of the signed magnitude only on the leading + * slice among other things.. + */ +SLICING_FUNCTION int32_t max_to_exponent_shift(double row_col_max) { + static constexpr int scale_max_exponent = max_exponent(); + + return scale_max_exponent - get_exponent(row_col_max); +} + +/* + * slices up the input value "val" in "nslices" of type "SliceValueType". + * Before slicing the number, the exponent of "val" is shifted based on + * the value of "exponent_shift" which has been computed using the + * max_to_exponent_shift function based on the max(abs()) of the relevant + * row/column of A/B. + * + * On exit, the first value of the returned array contains the most + * significant slice. + */ +template +SLICING_FUNCTION cuda::std::array slices_from_fp64(double val, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + + cuda::std::array slices = {0}; + + static constexpr double normalization_factor = 0x1.0p52; + // Normalise denormalised numbers, but store the effective exponent in its own variable, + // allowing for representation of fp64 denorms as normalised numbers. 
+ + int skip_slices = 0; + int64_t r = 0; + + uint8_t reg_pack = 0; + + double_structure r0 = {val}; + int denorm_compensation = 0; + if (r0.s.exponent == 0) { + if (r0.d == 0.0) { + skip_slices = nslices; + r = 0; + } else { + /* round to nearest is the default behavior on CPU */ + r0.d = (r0.d * normalization_factor); + denorm_compensation = -52; + } + } + int exp = r0.s.exponent + exponent_shift + denorm_compensation - bias; + exp += (nslices - 1) * get_width(); // Use all 8 bits. + + // Adjust casting range. + int extra_width = (exp + 1) - 63; + extra_width = extra_width > 0 ? extra_width : 0; + skip_slices = div_up(extra_width, get_width()); + exp -= skip_slices * get_width(); + + // Handle exp outside of double range + if (exp < 0) { + r = 0; + } else { + r0.s.exponent = (unsigned int)(exp + bias); + r = static_cast(r0.d); + } + + for (int64_t _i = 0; _i < nslices; _i++) { + int64_t i = nslices - 1 - _i; + + if (_i < skip_slices) { + reg_pack = 0; + } else { + reg_pack = static_cast(r); + slices[i] = static_cast(reg_pack); + r = (r >> get_width()) + (reg_pack >> get_width()); + } + } + + return slices; +} + +/* + * This function is a building block to reconstruct an FP64 number from the slices. + * Instead of receiving the set of slices and adding them to an FP64 number, + * this function gets a single slice (the NTH-slice) and returns it as an FP64 value. + * + * In this way, one could use this function to compute and accumulate the contributions + * from the slices separately. + * + * Remark: when reconstructing an FP64 number accumulate the least significant + * diagonals first to avoid catastrophic cancellation. 
+ */ +template +SLICING_FUNCTION double nth_slice_to_fp64(int32_t nth, DiagonalAccType nth_slice, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + assert(nth >= 0); + + /* In some instances, we use the unsigned value type to leverage all bits for storage */ + double ko = pow(2.0, -get_width>() * nth); + + double value_i = ko * static_cast(nth_slice); + double_structure value = {value_i}; + epilogue_ldexp(value, -exponent_shift); + return value.d; +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/dgemm_emulation.cu b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/dgemm_emulation.cu new file mode 100644 index 00000000..ccfe373b --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/dgemm_emulation.cu @@ -0,0 +1,356 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +// cuda libraries +#include + +// utility headers +#include +#include +#include + +#include "slicing.hpp" +#include "emulation_kernels.hpp" + +// This example demonstrates the Ozaki scheme for emulating double precision GEMM +// using multiple lower precision GEMM operations. The Ozaki scheme works by: +// 1. Decomposing double precision matrices into multiple int8_t "slices" +// 2. Performing GEMM on each combination of slices +// 3. 
Reconstructing the final double precision result +// +// Mathematical foundation: +// For double precision values a and b, we can represent them as: +// a = Σ(i=0 to slices-1) a_i * 2^(shift_a - i*bits_per_slice) +// b = Σ(j=0 to slices-1) b_j * 2^(shift_b - j*bits_per_slice) +// +// Then a*b = ΣΣ a_i * b_j * 2^(shift_a + shift_b - (i+j)*bits_per_slice) +// +// This allows us to compute the product using multiple int8_t GEMM operations +// and then combine the results with appropriate scaling. + +// Main cuBLASDx DGEMM emulation function using Ozaki scheme +// This function orchestrates the entire emulation process: +// 1. Preprocessing: Extract scaling factors from input matrices +// 2. Slicing: Decompose double precision matrices into int8_t slices +// 3. Matrix multiplication: Perform GEMM on slice combinations +// 4. Reconstruction: Combine results back to double precision +template +auto run_tutorial_kernel(double alpha, + ATensor const& tensor_a, + BTensor const& tensor_b, + double beta, + CTensor const& tensor_c, + cudaStream_t stream = 0, + unsigned warm_up_runs = 10, + unsigned kernel_runs = 100, + bool debug = false) { + + float total_time = 0.f; + + /* ============================================================== */ + /* OZAKI SCHEME STEP 1: SETUP */ + /* Prepare slice tensors */ + /* ============================================================== */ + // Verify that tile dimensions divide evenly into problem dimensions + + // Each slice represents a portion of the original double precision values + + using slice_value_type = typename BLAS::a_value_type; + using accumulator_value_type = typename BLAS::c_value_type; + + // Number of slices per elements + auto const static_slices = cuda::std::integral_constant {}; + + // Create slice tensor A: [m, k, slices] - stores int8_t slices of matrix A + auto const [shape_a_rows_, shape_a_cols_] = tensor_a.layout().shape(); + int const shape_a_rows = shape_a_rows_; + int const shape_a_cols = shape_a_cols_; + 
constexpr auto arr_a = cublasdx::arrangement_of_v_a; + auto d_slice_a_storage = + tutorial::get_empty_device_tensor(shape_a_rows, shape_a_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_a = cuda::std::get<1>(d_slice_a_storage); + + + // Create slice tensor B: [k, n, slices] - stores int8_t slices of matrix B + auto const [shape_b_rows_, shape_b_cols_] = tensor_b.layout().shape(); + int const shape_b_rows = shape_b_rows_; + int const shape_b_cols = shape_b_cols_; + constexpr auto arr_b = cublasdx::arrangement_of_v_b; + auto d_slice_b_storage = + tutorial::get_empty_device_tensor(shape_b_rows, shape_b_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_b = cuda::std::get<1>(d_slice_b_storage); + + + // Create slice tensor C: [m, n, slice_products] - stores int32_t slices of matrix C + auto const [shape_c_rows_, shape_c_cols_] = tensor_c.layout().shape(); + int const shape_c_rows = shape_c_rows_; + int const shape_c_cols = shape_c_cols_; + constexpr auto arr_c = cublasdx::arrangement_of_v_c; + auto d_products_storage = + tutorial::get_empty_device_tensor(shape_c_rows, shape_c_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_products = cuda::std::get<1>(d_products_storage); + + + /* ============================================================== */ + /* OZAKI SCHEME STEP 2: PREPROCESSING */ + /* Extract max exponent of rows(A) and cols(B) */ + /* ============================================================== */ + + // The Ozaki scheme requires finding the maximum absolute value in each + // row of A and each column of B to determine appropriate scaling factors. + // These scaling factors ensure that when we slice the double precision + // values into int8_t components, we don't lose significant precision. 
+ + using shift_t = int32_t; + constexpr auto shift_arr = cublasdx::col_major; + + // Create tensors for the shift values with proper tiling structure + auto const static_tile_m = cuda::std::integral_constant> {}; + auto d_shift_a_storage = + tutorial::get_empty_device_tensor(static_tile_m, shape_a_rows / static_tile_m()); + auto tensor_shift_a = cuda::std::get<1>(d_shift_a_storage); + + auto const static_tile_n = cuda::std::integral_constant> {}; + auto d_shift_b_storage = + tutorial::get_empty_device_tensor(static_tile_n, shape_b_cols / static_tile_n()); + auto tensor_shift_b = cuda::std::get<1>(d_shift_b_storage); + + // Execute preprocessing kernels to find maximum values and compute scaling factors + { + auto run_preprocessing = [&](auto str) { + constexpr int reduction_block_size = 64; + // Find max absolute value in each row of A and convert to exponent shift + max_reduce_kernel + <<>>(tensor_a, tensor_shift_a); + // Find max absolute value in each column of B and convert to exponent shift + max_reduce_kernel + <<>>(tensor_b, tensor_shift_b); + }; + + auto time_ms = tutorial::measure::execution(run_preprocessing, warm_up_runs, kernel_runs, stream); + + total_time += time_ms; + if (debug) { + std::cout << "----> Custom Preprocess time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 3: SLICING */ + /* Slice up input A and B matrices */ + /* ============================================================== */ + + // This step decomposes each double precision value into multiple int8_t slices. + // For a double precision value x with scaling factor s, we create slices such that: + // x ≈ Σ(i=0 to slices-1) slice_i * 2^(s - i*8) + // where each slice_i is an int8_t value. 
+ + { + + auto run_slicing = [&](auto str) { + constexpr auto slice_kernel_block_size = 64; + // Slice matrix A: each double precision element becomes slices int8_t values + slice_kernel + <<>>( + tensor_a, tensor_shift_a, tensor_slice_a, shape_a_cols); + // Slice matrix B: each double precision element becomes slices int8_t values + slice_kernel + <<>>( + tensor_b, tensor_shift_b, tensor_slice_b, shape_a_cols); + }; + + auto time_ms = tutorial::measure::execution(run_slicing, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Slice time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 4: MATRIX MULTIPLICATION */ + /* Product of slices */ + /* ============================================================== */ + + // This is the core of the Ozaki scheme. We need to compute the product: + // C = A * B = (Σ A_i * 2^shift_A_i) * (Σ B_j * 2^shift_B_j) + // = ΣΣ A_i * B_j * 2^(shift_A_i + shift_B_j) + // + // We compute this as multiple GEMM operations between slice combinations, + // with each result scaled appropriately and accumulated into the final result. 
+ + { + #include "pipeline_config.hpp.inc" + + dim3 grid(shape_a_rows / static_tile_m(), shape_b_cols / static_tile_n()); + + auto kernel = fused_epilogue_kernel; + + auto shared_memory_size = cublasdx::make_shared_storage_calculator() + .add(device_pipeline.buffer_alignment(), device_pipeline.buffer_size()) + .add(16, sizeof(int32_t), static_tile_m()) // shift_a + .add(16, sizeof(int32_t), static_tile_n()) // shift_b + .get(); + + + CUDA_CHECK_AND_EXIT( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); + + auto dummy_c_storage = tutorial::get_copy_tensor(tensor_c); + auto dummy_tensor_c = cuda::std::get<1>(dummy_c_storage); + + auto run_fused_epilogue = [&](auto str) { + kernel<<>>( + device_pipeline, alpha, beta, dummy_tensor_c, tensor_shift_a, tensor_shift_b); + }; + + auto time_ms = tutorial::measure::execution(run_fused_epilogue, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Epilogue time: " << time_ms << " ms" << std::endl; + } + + // Run correctness check + kernel<<>>( + device_pipeline, alpha, beta, tensor_c, tensor_shift_a, tensor_shift_b); + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + std::vector results(tensor_c.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + tensor_c.size() * sizeof(double), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_tflops = tutorial::real_gemm_tflops(shape_a_rows, shape_b_cols, shape_a_cols) / total_time; + return cuda::std::make_tuple(total_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + using alpha_value_type = double; + using beta_value_type = double; + + constexpr auto arrangement_a = cublasdx::row_major; + constexpr auto arrangement_b = cublasdx::col_major; + constexpr auto arrangement_c = cublasdx::col_major; + + int const warm_up_runs = 10; + int const 
kernel_runs = 100; + + #include "parameters.hpp.inc" + + bool const debug = false; + + for (tutorial::gemm_problem_t problem : problems) { + int const m = problem.m; + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + + std::cout << "Computing GEMM M=" << m << " N=" << n << " K=" << k << " (slices=" << slices << ")\n"; + + // =================================== + // Ozaki scheme configuration + // =================================== + + #include "cublasdx_config.hpp.inc" + + if (m % tile_m != 0 or n % tile_n != 0 or k % tile_k != 0) { + std::cerr << "Problem shape must be divisible by tile shape" << std::endl; + exit(-1); + } + + // =================================== + // Data type definitions + // =================================== + + using a_value_type = double; + using b_value_type = double; + using c_value_type = double; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + if (debug) { + tutorial::print_device_properties(); + } + + /* ============================================================== */ + /* Input FP64 (host) tensors */ + /* ============================================================== */ + static const float range_lower_bound = 1.0f / 3.14f; + static const float range_upper_bound = 52.0f / 3.14f; + int seed = 1234; + constexpr tutorial::random_distribution dist = tutorial::random_distribution::uniform; + + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor( + m, k, range_lower_bound, range_upper_bound, seed); + auto [vector_b, tensor_b] = tutorial::get_random_device_tensor( + k, n, range_lower_bound, range_upper_bound, seed + 1); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_random_device_tensor( + m, n, range_lower_bound, range_upper_bound, seed + 2); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + /* ============================================================== */ 
+ /* Compute Reference Result */ + /* ============================================================== */ + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, tensor_b, beta, tensor_c_reference, stream, warm_up_runs, kernel_runs); + + + /* ============================================================== */ + /* Compute Emulation Result */ + /* ============================================================== */ + auto [time_tutorial, tflops_tutorial, results_tutorial] = run_tutorial_kernel( + alpha, tensor_a, tensor_b, beta, tensor_c_custom, stream, warm_up_runs, kernel_runs, debug); + + /* ========================================================================================= */ + /* Print summary of performance and correctness results */ + /* ========================================================================================= */ + std::cout << "\nCustom Emulation Kernel (fused)\n"; + std::cout << std::fixed << std::setprecision(4); + std::cout << "Avg time [ms] = " << time_tutorial << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_tutorial << "\n"; + + std::cout << "\ncuBLASLt (not including heuristic)\n"; + std::cout << "Avg time [ms] = " << time_reference << "\n"; + std::cout << "Avg TFLOP/s = " << tflops_reference << "\n\n"; + + constexpr bool verbose_knob = false; + constexpr bool print_knob = true; + + auto error = tutorial::calculate_error(results_tutorial, results_reference, verbose_knob, print_knob); + std::cout << std::fixed << std::setprecision(10) << "Total relative error = " << error << "\n"; + + std::cout << std::fixed << std::setprecision(2) << (tflops_tutorial / tflops_reference) * 100 + << "% reference performance \n\n"; + } + + return 0; +} diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/emulation_kernels.hpp b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/emulation_kernels.hpp 
new file mode 100644 index 00000000..bb0aa062 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/emulation_kernels.hpp @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include + +#include +using namespace cublasdx; + +#include + +#include +#include "slicing.hpp" + +enum class slice_matrix +{ + a, + b +}; + +template +__launch_bounds__(BlockSize, 2) __global__ void max_reduce_kernel(InTensor in_tensor, OutTensor out_tensor) { + using datatype = tutorial::tensor_value_type_t; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const auto [tile_size_x, tile_size_y] = in_tensor.layout().shape(); + auto tid = threadIdx.x; + auto bid = blockIdx.x; + + // Assume that tensor is reduced along the last dimension + auto const row_index = tutorial::conditional_return(bid, cublasdx::slice); + auto const col_index = tutorial::conditional_return(cublasdx::slice, bid); + + auto global_tile = in_tensor(row_index, col_index); + + // 1. Find local maximum absolute value for this thread + double local_max = 0; + + auto const length = (SliceMatrix == slice_matrix::a) ? tile_size_y : tile_size_x; + for (auto i = tid; i < length; i += BlockSize) { + local_max = cuda::std::max(local_max, cuda::std::abs(global_tile(i))); + } + + // 2. Compute block-wide reduction to find maximum across all threads + __syncthreads(); + const double block_max = BlockReduce(temp_storage).Reduce(local_max, [](const auto& a, const auto& b) { + return cuda::std::max(a, b); + }); + + // 3. 
Convert maximum value to exponent shift and store to global memory + // This shift determines the scaling factor for slicing this row/column + if (tid == 0) { + out_tensor(bid) = max_to_exponent_shift(block_max); + } +} + + +template +__launch_bounds__(BlockSize, 2) __global__ + void slice_kernel(InTensor in_tensor, ShiftTensor shift_tensor, OutTensor out_tensor, int32_t reduction_dim_size) { + using in_datatype = tutorial::tensor_value_type_t; + using out_datatype = tutorial::tensor_value_type_t; + + const auto tid = threadIdx.x + blockIdx.x * BlockSize; + + // Calculate which matrix element this thread processes + auto slow_idx = tid / reduction_dim_size; + auto fast_idx = tid % reduction_dim_size; + + auto const row_idx = (SliceMatrix == slice_matrix::a) ? slow_idx : fast_idx; + auto const col_idx = (SliceMatrix == slice_matrix::a) ? fast_idx : slow_idx; + + // Decompose the double precision value into multiple int8_t slices + // using the appropriate scaling factor for this row/column + const cuda::std::array slices = + slices_from_fp64(in_tensor(row_idx, col_idx), shift_tensor(slow_idx)); + +// Store all slices for this matrix element +#pragma unroll + for (int elem = 0; elem < Slices; ++elem) { + out_tensor(row_idx, col_idx, elem) = slices[elem]; + } +} + +#include "fused_kernel.hpp.inc" + diff --git a/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/slicing.hpp b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/slicing.hpp new file mode 100644 index 00000000..c39cbfcd --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/03-Ozaki-I-Emulation/cpp/2c_fully_fused_emulation/slicing.hpp @@ -0,0 +1,221 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#define SLICING_FUNCTION __host__ __device__ __forceinline__ + +union double_structure { + double d; + struct float64 { + unsigned int mantissa_lo : 32; + unsigned int mantissa_hi : 20; + unsigned int exponent : 11; + unsigned int sign : 1; + } s; +}; + +static constexpr int bias = 1023; + +/* + * Signed magnitudes of length N only allow for (N-1) of effective storage. + */ +template +SLICING_FUNCTION constexpr int get_width() { + if constexpr (cuda::std::is_signed()) { + return 8 * sizeof(T) - 1; + } else { + return 8 * sizeof(T); + }; +} + +SLICING_FUNCTION int64_t div_up(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +SLICING_FUNCTION int rz_width(const double_structure& em) { + return em.s.exponent + 1 - bias; +} +// Length of a bits before the decimal point. i.e., bit width if casted to infinite-length int type. +SLICING_FUNCTION int rz_width(const double d) { + double_structure em {d}; + return rz_width(em); +} + +SLICING_FUNCTION constexpr int64_t ipow_p(int64_t base, int exp, int64_t ans = 1) { + return exp < 1 ? ans : ipow_p(base * base, exp / 2, (exp % 2) ? ans * base : ans); +} + +SLICING_FUNCTION constexpr double ipow(int base, int exp) { + return exp > 0 ? 
ipow_p(base, exp) : 1.0 / ipow_p(base, -exp); +} + +template +SLICING_FUNCTION constexpr int max_exponent(); + +// scale to numbers no bigger than 256 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 8; +} + +// scale to numbers no bigger than 128 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 7; +} + +SLICING_FUNCTION int32_t get_exponent(double val) { + double_structure em = {val}; + + int em_exponent = (em.s.exponent + 1 - bias); + + if (em.s.mantissa_hi & (63 << 14) == (63 << 14)) + em_exponent++; + + return em_exponent; +} + +// An implementation of ldexp() to be used in scaling double-precision numbers obtained from unpacking slices +// in the epilogue. The resulting double values must be finite and normalized, and so the fast path should +// simply adjust the exponent field of the value, so long as the result is also finite and normalized. +SLICING_FUNCTION void epilogue_ldexp(double_structure& em, int exp) { + static constexpr int exp_max = bias - 1; + int previous_exp_biased = static_cast(em.s.exponent); + if (0 < previous_exp_biased && 0 < previous_exp_biased + exp && previous_exp_biased + exp <= exp_max + bias) { + em.s.exponent += exp; + return; + } + em.d = ldexp(em.d, exp); +} + +/* + * Returns the exponent shift to be applied to a row/colum + * based on the max(abs()) on that row/column. + * Naively, this scaling factor would be just the exponent + * of max(abs()) but we do some other tricks to account for + * the encoding of the signed magnitude only on the leading + * slice among other things.. + */ +SLICING_FUNCTION int32_t max_to_exponent_shift(double row_col_max) { + static constexpr int scale_max_exponent = max_exponent(); + + return scale_max_exponent - get_exponent(row_col_max); +} + +/* + * slices up the input value "val" in "nslices" of type "SliceValueType". 
+ * Before slicing the number, the exponent of "val" is shifted based on + * the value of "exponent_shift" which has been computed using the + * max_to_exponent_shift function based on the max(abs()) of the relevant + * row/column of A/B. + * + * On exit, the first value of the returned array contains the most + * significant slice. + */ +template +SLICING_FUNCTION cuda::std::array slices_from_fp64(double val, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + + cuda::std::array slices = {0}; + + static constexpr double normalization_factor = 0x1.0p52; + // Normalise denormalised numbers, but store the effective exponent in its own variable, + // allowing for representation of fp64 denorms as normalised numbers. + + int skip_slices = 0; + int64_t r = 0; + + uint8_t reg_pack = 0; + + double_structure r0 = {val}; + int denorm_compensation = 0; + if (r0.s.exponent == 0) { + if (r0.d == 0.0) { + skip_slices = nslices; + r = 0; + } else { + /* round to nearest is the default behavior on CPU */ + r0.d = (r0.d * normalization_factor); + denorm_compensation = -52; + } + } + int exp = r0.s.exponent + exponent_shift + denorm_compensation - bias; + exp += (nslices - 1) * get_width(); // Use all 8 bits. + + // Adjust casting range. + int extra_width = (exp + 1) - 63; + extra_width = extra_width > 0 ? 
extra_width : 0; + skip_slices = div_up(extra_width, get_width()); + exp -= skip_slices * get_width(); + + // Handle exp outside of double range + if (exp < 0) { + r = 0; + } else { + r0.s.exponent = (unsigned int)(exp + bias); + r = static_cast(r0.d); + } + + for (int64_t _i = 0; _i < nslices; _i++) { + int64_t i = nslices - 1 - _i; + + if (_i < skip_slices) { + reg_pack = 0; + } else { + reg_pack = static_cast(r); + slices[i] = static_cast(reg_pack); + r = (r >> get_width()) + (reg_pack >> get_width()); + } + } + + return slices; +} + +/* + * This function is a building block to reconstruct an FP64 number from the slices. + * Instead of receiving the set of slices and adding them to an FP64 number, + * this function gets a single slice (the NTH-slice) and returns it as an FP64 value. + * + * In this way, one could use this function to compute and accumulate the contributions + * from the slices separately. + * + * Remark: when reconstructing an FP64 number accumulate the least significant + * diagonals first to avoid catastrophic cancellation. 
+ */ +template +SLICING_FUNCTION double nth_slice_to_fp64(int32_t nth, DiagonalAccType nth_slice, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + assert(nth >= 0); + + /* In some instances, we use the unsigned value type to leverage all bits for storage */ + double ko = pow(2.0, -get_width>() * nth); + + double value_i = ko * static_cast(nth_slice); + double_structure value = {value_i}; + epilogue_ldexp(value, -exponent_shift); + return value.d; +} diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/04.01-SYRK.ipynb b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/04.01-SYRK.ipynb new file mode 100644 index 00000000..562cb8f8 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/04.01-SYRK.ipynb @@ -0,0 +1,855 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "24069346-2c9c-4d6b-b812-d59f2804e36c", + "metadata": {}, + "source": [ + "## Challenge Exercise 4.1: SYRK Emulation\n", + "\n", + "Now that we understand how to build a performant emulated GEMM kernel, we can start to think about how we can apply the same algorithm and techniques to other routines.\n", + "\n", + "A very closely related routine is the symmetric rank-k update, often referred to as SYRK. For a given $n \\times k$ matrix $\\mathbf{A}$, we can compute an $n \\times n$ matrix $\\mathbf{C}$ at row $i$, column $j$ as follows:\n", + "\n", + "$$\n", + "\\mathbf{C}_{i, j} = \\alpha \\sum_{l=0}^{k}\\left( \\mathbf{A}_{i, l} \\mathbf{A}^{T}_{l, j} \\right) + \\beta \\mathbf{C}_{i, j}\n", + "$$\n", + "\n", + "You may notice that this follows the same definition as GEMM, except that we are multiplying $\\mathbf{A}$ with itself. This allows the output matrix to be symmetric (i.e. 
$\\mathbf{C} = \\mathbf{C}^{T}$)\n", + "\n", + "That matrix property along with the knowledge that we are multiply $A$ by itself allows us to make more problem specific optimizations to reduce the amount of math operations and access memory more efficiently." + ] + }, + { + "cell_type": "markdown", + "id": "a0db64c8-86ef-4756-930d-bcd78b776b93", + "metadata": {}, + "source": [ + "### C++ CMake Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be4d4b5-ec12-4a3c-a0b2-30b3bf786314", + "metadata": {}, + "outputs": [], + "source": [ + "import sys, os\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "from common_cuda import setup_cmake_project\n", + "setup_cmake_project()" + ] + }, + { + "cell_type": "markdown", + "id": "94855e83-341a-4b13-83ea-273c6e7af279", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "### Python Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23406d00-44c5-4f62-b9d3-82dd1ba25817", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import *" + ] + }, + { + "cell_type": "markdown", + "id": "2cf99cb9-bfa3-4c5e-bfcf-cbae48061443", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "### C++" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddac7a49-07ab-4337-979c-aa9e121902f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile 
cpp/3a_fused_syrk_emulation/parameters.hpp.inc\n", + "\n", + " int const warm_up_runs = 10;\n", + " int const kernel_runs = 100;\n", + "\n", + " // ===================================\n", + " // Problem configuration\n", + " // ===================================\n", + "\n", + " // (syrk_n, syrk_k, alpha, beta, uplo)\n", + " std::vector problems = {\n", + " {8192, 8192, 0.9, 1.1, tutorial::matrix_half::upper},\n", + " {8192, 8192, 0.9, 1.1, tutorial::matrix_half::lower}\n", + " };\n", + " \n", + "\n", + " // ===================================\n", + " // Global SYRK configuration\n", + " // ===================================\n", + "\n", + " // The number of slices used in emulation algorithm\n", + " // More slices = higher precision but more computation\n", + " constexpr unsigned slices = 7;\n", + "\n", + " bool const debug = false;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9545479-9d2f-4130-b410-173f9cf87943", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/3a_fused_syrk_emulation/cublasdx_config.hpp.inc\n", + "\n", + " using slice_value_type = int8_t; // Precision for individual slices\n", + " using accumulator_value_type = int32_t; // Precision for accumulation\n", + "\n", + " // The shape of data tile processed by a single CTA block\n", + " constexpr int tile_m = 128;\n", + " constexpr int tile_n = 128;\n", + " constexpr int tile_k = 128;\n", + "\n", + " // The shape of CTA block (number of threads)\n", + " constexpr int cta_shape_x = 128;\n", + " constexpr int cta_shape_y = 1;\n", + " constexpr int cta_shape_z = 1;\n", + "\n", + " using BLAS = decltype(cublasdx::Size() +\n", + " cublasdx::Precision() +\n", + " cublasdx::Type() + cublasdx::Function() +\n", + " cublasdx::Arrangement() + cublasdx::Block() +\n", + " cublasdx::BlockDim() + cublasdx::StaticBlockDim() +\n", + " cublasdx::MaxAlignment() + cublasdx::EnableInputStreaming() + cublasdx::WithPipeline() +\n", + " cublasdx::SM());" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "845a0f84-9477-4294-b59b-f4d806411cba", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/3a_fused_syrk_emulation/pipeline_config.hpp.inc\n", + "\n", + " constexpr unsigned pipeline_depth = 3;\n", + "\n", + " auto device_pipeline = cublasdx::suggest_device_pipeline(\n", + " tensor_slice_a, tensor_slice_at)\n", + " .value();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "632d63bf-c79e-4f0b-ab54-75e574518fc9", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/3a_fused_syrk_emulation/kernel_config.hpp.inc\n", + "\n", + " dim3 grid(shape_a_rows / static_tile_m(), shape_a_rows / static_tile_n());\n", + " auto kernel = fused_epilogue_kernel;\n", + "\n", + " auto shared_memory_size = cublasdx::make_shared_storage_calculator()\n", + " .add(device_pipeline.buffer_alignment(), device_pipeline.buffer_size())\n", + " .add(16, sizeof(int32_t), static_tile_m()) // shift_a\n", + " .add(16, sizeof(int32_t), static_tile_n()) // shift_b\n", + " .get();\n", + "\n", + " CUDA_CHECK_AND_EXIT(\n", + " cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1013bf66-8ad0-441d-b86e-694588047e84", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/3a_fused_syrk_emulation/fused_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " Alpha alpha,\n", + " Beta beta,\n", + " CTensor gmem_c_fp64,\n", + " tutorial::matrix_half output_half,\n", + " AShiftTensor const gmem_shift_a,\n", + " AtShiftTensor const gmem_shift_at) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) 
{\n", + " // CHALLENGE EXERCISE --> Implement a fused SYRK\n", + " // HINT --> Start with emulated GEMM kernel from 3.3\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aabebb9-0b1f-49f1-9675-5649b7219ddc", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/3a_fused_syrk_emulation/kernel_launch.hpp.inc\n", + "\n", + " kernel<<>>(\n", + " device_pipeline, alpha, beta, tensor_c, output_half, tensor_shift_a, tensor_shift_at);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c94a5e0-4f37-4a41-9524-31bf6f593cc4", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build ./build -t 3a_fused_syrk_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29399cdc-c197-4e78-9e05-f2b9833dffc9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!./build/3a_fused_syrk_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "06cd736c-ed7b-4319-acd5-4d5c6d10ea37", + "metadata": {}, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "markdown", + "id": "4853a1a3-c227-4811-b23d-b36f4cc3969f", + "metadata": {}, + "source": [ + "We will rewrite kernel now and recompile the solution. If you want to restart your exercise make sure you rewrite kernel back and recompile it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59880a32-4f56-4b34-9cf5-5289e9d85cb2", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile cpp/3a_fused_syrk_emulation/fused_kernel.hpp.inc\n", + "\n", + "template\n", + "__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__\n", + " void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline,\n", + " Alpha alpha,\n", + " Beta beta,\n", + " CTensor gmem_c_fp64,\n", + " tutorial::matrix_half output_half,\n", + " AShiftTensor const gmem_shift_a,\n", + " AtShiftTensor const gmem_shift_at) {\n", + " extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[];\n", + "#ifdef __CUDA_ARCH__\n", + " if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) {\n", + " // ================================\n", + " // 1. SETUP AND TILE PREPARATION\n", + " // ================================\n", + "\n", + " constexpr int tile_m = cublasdx::size_of_v_m;\n", + " constexpr int tile_n = cublasdx::size_of_v_n;\n", + "\n", + " auto const block_offset_m = blockIdx.x * tile_m;\n", + " auto const block_offset_n = blockIdx.y * tile_n;\n", + "\n", + " if ((block_offset_n > (block_offset_m + tile_m) and output_half == tutorial::matrix_half::lower) or\n", + " (block_offset_m > (block_offset_n + tile_n) and output_half == tutorial::matrix_half::upper)) {\n", + " return;\n", + " }\n", + "\n", + " constexpr auto initial_diag = Slices - 1;\n", + " constexpr auto initial_term = 0;\n", + "\n", + " auto [pipeline_smem, smem_shift_a, smem_shift_at] =\n", + " cublasdx::shared_memory::slice(smem,\n", + " device_pipeline.buffer_alignment(),\n", + " device_pipeline.buffer_size(),\n", + " cublasdx::alignment_of_v_a,\n", + " cute::make_layout(cute::Int()),\n", + " cublasdx::alignment_of_v_b,\n", + " cute::make_layout(cute::Int()));\n", + "\n", + " // Copy general purpose data\n", + " cublasdx::copy(gmem_shift_a(cute::_, blockIdx.x), smem_shift_a);\n", + " 
cublasdx::copy(gmem_shift_at(cute::_, blockIdx.y), smem_shift_at);\n", + " cublasdx::copy_wait();\n", + "\n", + "\n", + " // Get pipeline tile\n", + " auto tile_pipeline = device_pipeline.get_tile(pipeline_smem,\n", + " cublasdx::make_coord(blockIdx.x, initial_term),\n", + " cublasdx::make_coord(blockIdx.y, initial_diag));\n", + "\n", + " auto accumulator = tile_pipeline.get_accumulator();\n", + "\n", + " // ================================\n", + " // 2. FP64 C INPUT / OUTPUT TILE SETUP\n", + " // ================================\n", + "\n", + " auto tile_c_fp64_gmem = cublasdx::get_tile(gmem_c_fp64, BLAS::c_shape, blockIdx.x, blockIdx.y);\n", + "\n", + " // ============================================\n", + " // 3. OZAKI SCHEME DIAGONAL ITERATION\n", + " // ============================================\n", + "\n", + " // Iterate over diagonals in reverse order (highest power of 2 first)\n", + " // This ensures proper accumulation order for numerical stability\n", + "# pragma unroll 1\n", + " for (auto diag = initial_diag; diag >= 0; --diag) {\n", + "\n", + " // Initialize accumulator for this diagonal\n", + " accumulator.clear();\n", + "\n", + " // ==========================================\n", + " // 4. SLICE COMBINATION COMPUTATION\n", + " // ==========================================\n", + "\n", + " // Compute all slice combinations that contribute to this diagonal\n", + " // For diagonal d, we compute: A_slice[i] * B_slice[d-i] for i = 0 to d\n", + "# pragma unroll 1\n", + " for (auto term = initial_term; term <= diag; ++term) {\n", + " // =========================================\n", + " // 5. N-STAGE MEMORY PIPELINE FOR GEMM\n", + " // =========================================\n", + "\n", + " tile_pipeline.execute(accumulator);\n", + "\n", + " const auto next_slice_row = (term == diag) ? 0 : term + 1; // A slice index\n", + " const auto next_slice_col = (term == diag) ? 
(diag - 1) : (diag - next_slice_row); // B slice index\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " cublasdx::make_coord(blockIdx.x, next_slice_row),\n", + " cublasdx::make_coord(blockIdx.y, next_slice_col));\n", + " } /* end of slice combination loop */\n", + "\n", + " // ========================================\n", + " // 6. RESULT RECONSTRUCTION AND EPILOGUE\n", + " // ========================================\n", + "\n", + "\n", + " // Load existing C values\n", + " auto d_fp64_frag = accumulator.make_partition_and_copy(tile_c_fp64_gmem);\n", + "\n", + " if(accumulator.is_thread_active()) {\n", + " // Convert accumulated int32_t results back to double precision\n", + " // and apply appropriate scaling based on slice positions\n", + " auto gemm_results = accumulator.get_results();\n", + "\n", + " // Process each element in the register fragment\n", + "# pragma unroll\n", + " for (int i = 0; i < cublasdx::size(d_fp64_frag); ++i) {\n", + " auto const [global_x, global_y] = accumulator.map_fragment_index(i);\n", + " auto const shift_a_elem = smem_shift_a(global_x);\n", + " auto const shift_at_elem = smem_shift_at(global_y);\n", + "\n", + " int const total_x = block_offset_m + global_x;\n", + " int const total_y = block_offset_n + global_y;\n", + " bool const is_in_bounds = (output_half == tutorial::matrix_half::lower and (total_x >= total_y)) or\n", + " (output_half == tutorial::matrix_half::upper and (total_y >= total_x));\n", + "\n", + " // Convert int32_t slice result back to double precision\n", + " // with appropriate scaling for this diagonal and element\n", + " double const val = nth_slice_to_fp64(diag, gemm_results(i), shift_a_elem + shift_at_elem);\n", + " d_fp64_frag(i) = is_in_bounds ? 
(alpha * val + beta * d_fp64_frag(i)) : d_fp64_frag(i);\n", + " }\n", + " }\n", + "\n", + " accumulator.partition_and_copy(d_fp64_frag, tile_c_fp64_gmem);\n", + " beta = 1.0;\n", + " }\n", + " }\n", + "#endif\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca293b12-c518-462a-91a4-0bd1965800e0", + "metadata": {}, + "outputs": [], + "source": [ + "!cmake --build ./build -t 3a_fused_syrk_emulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b4fc7cf-8368-4fb7-8553-e0a0adb11dc4", + "metadata": {}, + "outputs": [], + "source": [ + "!./build/3a_fused_syrk_emulation" + ] + }, + { + "cell_type": "markdown", + "id": "e79313d8-adca-4056-8f8c-f9612f08a104", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "### Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf954468-ad39-45ea-985d-63cb990ae3cc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import nvmath\n", + "import math\n", + "\n", + "from nvmath.device import Matmul\n", + "from nvmath.device.cublasdx import DevicePipeline, SharedStorageCalc\n", + "from nvmath.device.cublasdx_numba import pipeline_extensions\n", + "from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor, make_fragment_like\n", + "from numba import cuda\n", + "\n", + "sys.path.append(os.sep.join([\"..\", \"utilities\", \"python\"]))\n", + "\n", + "from benchmark import *\n", + "from emulation_utils import get_width, epilogue_ldexp, MatrixHalf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37722a1c-a7b1-4a3b-b191-8158644b2cc7", + "metadata": {}, + "outputs": [], + "source": [ + "problems = [\n", + " (8192, 8192, 0.9, 1.1, 'L'),\n", + " (8192, 8192, 0.9, 1.1, 'U'),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"ddcb5d86-e55b-4f2d-a423-4abe53e38095", + "metadata": {}, + "outputs": [], + "source": [ + "def get_emulated_dsyrk_kernel(BLAS, matrix_half):\n", + " \n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " TILE_M, TILE_N = BLAS.c_dim\n", + " TILE_K = BLAS.a_dim[1]\n", + " BLOCK_SIZE = BLAS.block_size\n", + " ALIGNMENT = min(BLAS.alignment.a, min(BLAS.alignment.b, BLAS.alignment.c))\n", + "\n", + " uint8_width = get_width(np.uint8)\n", + "\n", + " assert TILE_M == TILE_N, \"Invalid SYRK configuration\"\n", + " is_lower = (matrix_half == MatrixHalf.lower)\n", + " \n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, launch_bounds=(BLOCK_SIZE, 1))\n", + " def dsyrk_kernel(slices, shift_a_tensor, alpha, beta, tensor_c, device_pipeline: DevicePipeline):\n", + " # CHALLENGE EXERCISE --> Implement a fused SYRK kernel\n", + " pass\n", + "\n", + " return dsyrk_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9522600b-2efa-446e-8dad-17e3fe57ce36", + "metadata": {}, + "outputs": [], + "source": [ + "def fused_dsyrk_ozaki(tensor_slicedA_cupy, tensor_c_cupy, tensor_shift_a_cupy, alpha, beta, context, warmup=True):\n", + " m, n = tensor_c_cupy.shape\n", + " _, k, slices = tensor_slicedA_cupy.shape\n", + "\n", + " BLAS = context[\"BLAS\"]\n", + " PIPELINE_DEPTH = context[\"PIPELINE_DEPTH\"]\n", + " syrk_kernel = context[\"syrk_kernel\"]\n", + " grid = context[\"syrk_grid\"]\n", + " block = context[\"syrk_block\"]\n", + "\n", + " _, TILE_N = BLAS.c_dim\n", + "\n", + " # Create transposed view of A for A^T\n", + " # Swap the shape and strides for the first two dimensions\n", + " stride_n, stride_k, stride_slices = 
tensor_slicedA_cupy.strides\n", + " tensor_slicedAT_cupy = cp.ndarray(\n", + " shape=(k, m, slices),\n", + " dtype=np.int8,\n", + " memptr=tensor_slicedA_cupy.data,\n", + " strides=(stride_k, stride_n, stride_slices)\n", + " )\n", + "\n", + " tensor_slicedA = cuda.as_cuda_array(tensor_slicedA_cupy)\n", + " tensor_slicedAT = cuda.as_cuda_array(tensor_slicedAT_cupy)\n", + " tensor_shift_a = cuda.as_cuda_array(tensor_shift_a_cupy)\n", + " tensor_c = cuda.as_cuda_array(tensor_c_cupy)\n", + "\n", + " device_pipeline = BLAS.suggest_device_pipeline(PIPELINE_DEPTH, tensor_slicedA, tensor_slicedAT)\n", + "\n", + " smem_size = device_pipeline.buffer_size + (TILE_N + TILE_N) * np.dtype(np.int32).itemsize\n", + " \n", + " if warmup:\n", + " set_max_dynamic_shared_size_bytes(syrk_kernel, smem_size,\n", + " slices, tensor_shift_a, alpha, beta, tensor_c, device_pipeline)\n", + " syrk_kernel[grid, block, 0, smem_size](slices, tensor_shift_a, alpha, beta, tensor_c, device_pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3258a169-e9bd-4f22-8017-ae643ff70aeb", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_func(n, k, matrix_half):\n", + " TILE_N = 128\n", + " TILE_K = 128\n", + " PIPELINE_DEPTH = 3\n", + " BLOCK_SIZE = 128\n", + " ALIGNMENT = 16\n", + " DATA_TYPE = \"real\"\n", + "\n", + " assert n % TILE_N == 0, \"Unsupported dimension n for TILE_N\"\n", + " assert k % TILE_K == 0, \"Unsupported dimension n for TILE_N\"\n", + " assert k >= (TILE_K * PIPELINE_DEPTH), \"Unsupported pipeline depth for k\"\n", + " \n", + " BLAS = Matmul(size=(TILE_N, TILE_N, TILE_K),\n", + " precision=(np.int8, np.int8, np.int32),\n", + " data_type=DATA_TYPE,\n", + " alignment=(ALIGNMENT, ALIGNMENT, ALIGNMENT),\n", + " arrangement=(\"row_major\", \"col_major\", \"col_major\"), # Do not change\n", + " execution=\"Block\",\n", + " block_size=BLOCK_SIZE,\n", + " with_pipeline=True,\n", + " enable_input_streaming=True,\n", + " static_block_dim=True)\n", + 
"\n", + " syrk_grid = (n // TILE_N, n // TILE_N)\n", + " syrk_block = BLAS.block_dim\n", + "\n", + " return {\n", + " \"BLAS\": BLAS,\n", + " \"PIPELINE_DEPTH\": PIPELINE_DEPTH,\n", + " \"syrk_kernel\" : get_emulated_dsyrk_kernel(BLAS, matrix_half),\n", + " \"syrk_grid\": syrk_grid,\n", + " \"syrk_block\": syrk_block,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3538a57d-b194-4404-a560-6e6ea4f9f2be", + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_fused_emulated_dsyrk(problems, setup_func, fused_dsyrk_ozaki)" + ] + }, + { + "cell_type": "markdown", + "id": "81c44ea6-70f5-43ba-92a1-88eaf4af5211", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "#### Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "819ed4a2-07ed-4423-b91b-6be2ec80f725", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "def get_emulated_dsyrk_kernel_solution(BLAS, matrix_half):\n", + "\n", + " assert BLAS.a_value_type == BLAS.b_value_type, \"Invalid BLAS configuration\"\n", + "\n", + " A_SIZE = BLAS.suggest_layout_smem_a().cosize\n", + " B_SIZE = BLAS.suggest_layout_smem_b().cosize\n", + " C_SIZE = BLAS.suggest_layout_rmem_c().cosize\n", + "\n", + " TILE_M, TILE_N = BLAS.c_dim\n", + " TILE_K = BLAS.a_dim[1]\n", + " BLOCK_SIZE = BLAS.block_size\n", + " ALIGNMENT = min(BLAS.alignment.a, min(BLAS.alignment.b, BLAS.alignment.c))\n", + "\n", + " uint8_width = get_width(np.uint8)\n", + "\n", + " assert TILE_M == TILE_N, \"Invalid SYRK configuration\"\n", + " is_lower = (matrix_half == MatrixHalf.lower)\n", + " \n", + " @cuda.jit(device=True, forceinline=True)\n", + " def nth_slice_to_fp64(nth, nth_slice, exponent_shift):\n", + " ko = math.pow(2.0, -nth * uint8_width)\n", + "\n", + " value = ko * np.float64(nth_slice)\n", + " return epilogue_ldexp(value, -exponent_shift)\n", + " \n", + " @cuda.jit(extensions=pipeline_extensions, 
launch_bounds=(BLOCK_SIZE, 1))\n", + " def dsyrk_kernel(slices, shift_a_tensor, alpha, beta, tensor_c, device_pipeline: DevicePipeline):\n", + " m, n = tensor_c.shape\n", + "\n", + " block_m = cuda.blockIdx.x\n", + " block_n = cuda.blockIdx.y\n", + "\n", + " smem_pipeline = cuda.shared.array(shape=(0,), dtype=BLAS.a_value_type, alignment=ALIGNMENT)\n", + " \n", + " smem_shift_a = cuda.shared.array(shape=(TILE_M), dtype=np.int32)\n", + " smem_shift_at = cuda.shared.array(shape=(TILE_N), dtype=np.int32)\n", + "\n", + " block_start_m = block_m * TILE_M\n", + " block_end_m = (block_m + 1) * TILE_M\n", + "\n", + " block_start_n = block_n * TILE_N\n", + " block_end_n = (block_n + 1) * TILE_N\n", + "\n", + " # Skip blocks outside the triangular region\n", + " if is_lower:\n", + " # Lower triangular: skip if block_n > block_m\n", + " if block_n > block_m:\n", + " return\n", + " else:\n", + " # Upper triangular: skip if block_m > block_n\n", + " if block_m > block_n:\n", + " return\n", + " \n", + " if block_start_m >= m or block_start_n >= n:\n", + " return\n", + "\n", + " shift_a_view = shift_a_tensor[block_start_m : block_end_m]\n", + " shift_at_view = shift_a_tensor[block_start_n : block_end_n]\n", + "\n", + " tid = cuda.threadIdx.x\n", + " if tid < TILE_M:\n", + " smem_shift_a[tid] = shift_a_view[tid]\n", + " if tid < TILE_N:\n", + " smem_shift_at[tid] = shift_at_view[tid]\n", + " cuda.syncthreads()\n", + "\n", + " c_view = tensor_c[\n", + " block_start_m : block_end_m,\n", + " block_start_n : block_end_n,\n", + " ]\n", + "\n", + " ldc = max(c_view.strides) // c_view.itemsize\n", + " gmem_c = make_tensor(c_view, BLAS.get_layout_gmem_c(ldc))\n", + " \n", + " initial_diag = slices - 1\n", + " initial_term = 0\n", + "\n", + " tile_pipeline = device_pipeline.get_tile(smem_pipeline,\n", + " (block_m, np.int32(initial_term)),\n", + " (block_n, np.int32(initial_diag)))\n", + " \n", + " accumulator = BLAS.suggest_accumulator()\n", + " beta_used = beta\n", + " for diag in 
range(initial_diag, -1, -1):\n", + " \n", + " accumulator.clear()\n", + " for term in range(initial_term, diag + 1):\n", + " tile_pipeline.execute(accumulator)\n", + "\n", + " next_slice_row = 0 if term == diag else term + 1\n", + " next_slice_col = (diag - 1) if term == diag else diag - next_slice_row\n", + "\n", + " device_pipeline.reset_tile(tile_pipeline,\n", + " (block_m, np.int32(next_slice_row)),\n", + " (block_n, np.int32(next_slice_col)))\n", + "\n", + " if accumulator.is_thread_active():\n", + " gemm_results = accumulator.get_results()\n", + " c_fp64_frag = make_fragment_like(gemm_results, np.float64)\n", + " copy_fragment(gmem_c, c_fp64_frag)\n", + "\n", + " for i in range(c_fp64_frag.layout.size):\n", + " (global_x, global_y) = accumulator.map_fragment_index(i)\n", + " shift_a = smem_shift_a[global_x]\n", + " shift_at = smem_shift_at[global_y]\n", + "\n", + " syrk_m = block_start_m + global_x\n", + " syrk_n = block_start_n + global_y\n", + " if is_lower:\n", + " in_bounds = (syrk_m >= syrk_n)\n", + " else:\n", + " in_bounds = (syrk_m <= syrk_n)\n", + "\n", + " value = alpha * nth_slice_to_fp64(diag, gemm_results[i], shift_a + shift_at)\n", + " c_fp64_frag[i] = value + beta_used * c_fp64_frag[i] if in_bounds else c_fp64_frag[i]\n", + "\n", + " accumulator.partition_and_copy(c_fp64_frag, gmem_c)\n", + "\n", + " beta_used = 1.0\n", + "\n", + " tile_pipeline._del()\n", + "\n", + " return dsyrk_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc99c07-5ced-493d-9d76-679ee5e39a99", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "def setup_func_solution(n, k, matrix_half):\n", + " ctx = setup_func(n, k, matrix_half)\n", + " BLAS = ctx[\"BLAS\"]\n", + " ctx[\"syrk_kernel\"] = get_emulated_dsyrk_kernel_solution(BLAS, matrix_half)\n", + "\n", + " return ctx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a80aa71f-a5f6-4541-85d2-e539865f7fe0", + "metadata": { 
+ "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "benchmark_fused_emulated_dsyrk(problems, setup_func_solution, fused_dsyrk_ozaki)" + ] + }, + { + "cell_type": "markdown", + "id": "446bb3ae-e948-4833-b8b0-eb07835f6ba3", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "c4a795c2-fcd8-4b45-baa5-880de4a4ceee", + "metadata": {}, + "source": [ + "In this chapter you have customized the kernel from 3.3 even further to accelerate a different algorithm, using the underlying flexibility of writing custom kernels with libraries only as building blocks." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/CMakeLists.txt b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/CMakeLists.txt new file mode 100644 index 00000000..7fa97771 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 4.0) + +LIST(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin") +project(cublasdx-dgemm-notebook-3 VERSION 0.1 LANGUAGES CUDA CXX) + +# Add header tutorial helper files +add_library(tutorial_helpers INTERFACE) +target_include_directories(tutorial_helpers INTERFACE ../../cpp_source/include/) + +include(../../cmake/tutorial.cmake) + +add_tutorial(3a_fused_syrk_emulation cpp/3a_fused_syrk_emulation/syrk_emulation.cu) diff --git 
a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/cublasdx_config.hpp.inc b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/cublasdx_config.hpp.inc new file mode 100644 index 00000000..f2a305a2 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/cublasdx_config.hpp.inc @@ -0,0 +1,21 @@ + + using slice_value_type = int8_t; // Precision for individual slices + using accumulator_value_type = int32_t; // Precision for accumulation + + // The shape of data tile processed by a single CTA block + constexpr int tile_m = 128; + constexpr int tile_n = 128; + constexpr int tile_k = 128; + + // The shape of CTA block (number of threads) + constexpr int cta_shape_x = 256; + constexpr int cta_shape_y = 1; + constexpr int cta_shape_z = 1; + + using BLAS = decltype(cublasdx::Size() + + cublasdx::Precision() + + cublasdx::Type() + cublasdx::Function() + + cublasdx::Arrangement() + cublasdx::Block() + + cublasdx::BlockDim() + cublasdx::StaticBlockDim() + + cublasdx::MaxAlignment() + cublasdx::EnableInputStreaming() + cublasdx::WithPipeline() + + cublasdx::SM()); diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/emulation_kernels.hpp b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/emulation_kernels.hpp new file mode 100644 index 00000000..80efb8e9 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/emulation_kernels.hpp @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include + +#include +using namespace cublasdx; + +#include + +#include +#include "slicing.hpp" + +enum class slice_matrix +{ + a, + b +}; + +template +__launch_bounds__(BlockSize, 2) __global__ void max_reduce_kernel(InTensor in_tensor, OutTensor out_tensor) { + using datatype = 
tutorial::tensor_value_type_t; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const auto [tile_size_x, tile_size_y] = in_tensor.layout().shape(); + auto tid = threadIdx.x; + auto bid = blockIdx.x; + + // Assume that tensor is reduced along the last dimension + auto const row_index = tutorial::conditional_return(bid, cublasdx::slice); + auto const col_index = tutorial::conditional_return(cublasdx::slice, bid); + + auto global_tile = in_tensor(row_index, col_index); + + // 1. Find local maximum absolute value for this thread + double local_max = 0; + + auto const length = (SliceMatrix == slice_matrix::a) ? tile_size_y : tile_size_x; + for (auto i = tid; i < length; i += BlockSize) { + local_max = cuda::std::max(local_max, cuda::std::abs(global_tile(i))); + } + + // 2. Compute block-wide reduction to find maximum across all threads + __syncthreads(); + const double block_max = BlockReduce(temp_storage).Reduce(local_max, [](const auto& a, const auto& b) { + return cuda::std::max(a, b); + }); + + // 3. Convert maximum value to exponent shift and store to global memory + // This shift determines the scaling factor for slicing this row/column + if (tid == 0) { + out_tensor(bid) = max_to_exponent_shift(block_max); + } +} + +template +__launch_bounds__(BlockSize, 2) __global__ + void slice_kernel(InTensor in_tensor, ShiftTensor shift_tensor, OutTensor out_tensor, int32_t reduction_dim_size) { + using in_datatype = tutorial::tensor_value_type_t; + using out_datatype = tutorial::tensor_value_type_t; + + const auto tid = threadIdx.x + blockIdx.x * BlockSize; + + // Calculate which matrix element this thread processes + auto slow_idx = tid / reduction_dim_size; + auto fast_idx = tid % reduction_dim_size; + + auto const row_idx = (SliceMatrix == slice_matrix::a) ? slow_idx : fast_idx; + auto const col_idx = (SliceMatrix == slice_matrix::a) ? 
fast_idx : slow_idx; + + // Decompose the double precision value into multiple int8_t slices + // using the appropriate scaling factor for this row/column + const cuda::std::array slices = + slices_from_fp64(in_tensor(row_idx, col_idx), shift_tensor(slow_idx)); + +// Store all slices for this matrix element +#pragma unroll + for (int elem = 0; elem < Slices; ++elem) { + out_tensor(row_idx, col_idx, elem) = slices[elem]; + } +} + +#include "fused_kernel.hpp.inc" + diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/fused_kernel.hpp.inc b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/fused_kernel.hpp.inc new file mode 100644 index 00000000..4f79ff03 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/fused_kernel.hpp.inc @@ -0,0 +1,141 @@ + +template +__launch_bounds__(DevicePipeline::max_threads_per_block, 1) __global__ + void fused_epilogue_kernel(__grid_constant__ DevicePipeline const device_pipeline, + Alpha alpha, + Beta beta, + CTensor gmem_c_fp64, + tutorial::matrix_half output_half, + AShiftTensor const gmem_shift_a, + AtShiftTensor const gmem_shift_at) { + extern __shared__ __align__(device_pipeline.buffer_alignment()) char smem[]; +#ifdef __CUDA_ARCH__ + if constexpr (cublasdx::sm_of_v == __CUDA_ARCH__) { + // ================================ + // 1. 
SETUP AND TILE PREPARATION + // ================================ + + constexpr int tile_m = cublasdx::size_of_v_m; + constexpr int tile_n = cublasdx::size_of_v_n; + + auto const block_offset_m = blockIdx.x * tile_m; + auto const block_offset_n = blockIdx.y * tile_n; + + if ((block_offset_n > (block_offset_m + tile_m) and output_half == tutorial::matrix_half::lower) or + (block_offset_m > (block_offset_n + tile_n) and output_half == tutorial::matrix_half::upper)) { + return; + } + + constexpr auto initial_diag = Slices - 1; + constexpr auto initial_term = 0; + + auto [pipeline_smem, smem_shift_a, smem_shift_at] = + cublasdx::shared_memory::slice(smem, + device_pipeline.buffer_alignment(), + device_pipeline.buffer_size(), + cublasdx::alignment_of_v_a, + cute::make_layout(cute::Int()), + cublasdx::alignment_of_v_b, + cute::make_layout(cute::Int())); + + // Copy general purpose data + cublasdx::copy(gmem_shift_a(cute::_, blockIdx.x), smem_shift_a); + cublasdx::copy(gmem_shift_at(cute::_, blockIdx.y), smem_shift_at); + cublasdx::copy_wait(); + + + // Get pipeline tile + auto tile_pipeline = device_pipeline.get_tile(pipeline_smem, + cublasdx::make_coord(blockIdx.x, initial_term), + cublasdx::make_coord(blockIdx.y, initial_diag)); + + auto accumulator = tile_pipeline.get_accumulator(); + + // ================================ + // 2. FP64 C INPUT / OUTPUT TILE SETUP + // ================================ + + auto tile_c_fp64_gmem = cublasdx::get_tile(gmem_c_fp64, BLAS::c_shape, blockIdx.x, blockIdx.y); + + // ============================================ + // 3. 
OZAKI SCHEME DIAGONAL ITERATION + // ============================================ + + // Iterate over diagonals in reverse order (highest power of 2 first) + // This ensures proper accumulation order for numerical stability +# pragma unroll 1 + for (auto diag = initial_diag; diag >= 0; --diag) { + + // Initialize accumulator for this diagonal + accumulator.clear(); + + // ========================================== + // 4. SLICE COMBINATION COMPUTATION + // ========================================== + + // Compute all slice combinations that contribute to this diagonal + // For diagonal d, we compute: A_slice[i] * B_slice[d-i] for i = 0 to d +# pragma unroll 1 + for (auto term = initial_term; term <= diag; ++term) { + // ========================================= + // 5. N-STAGE MEMORY PIPELINE FOR GEMM + // ========================================= + + tile_pipeline.execute(accumulator); + + const auto next_slice_row = (term == diag) ? 0 : term + 1; // A slice index + const auto next_slice_col = (term == diag) ? (diag - 1) : (diag - next_slice_row); // B slice index + device_pipeline.reset_tile(tile_pipeline, + cublasdx::make_coord(blockIdx.x, next_slice_row), + cublasdx::make_coord(blockIdx.y, next_slice_col)); + } /* end of slice combination loop */ + + // ======================================== + // 6. 
RESULT RECONSTRUCTION AND EPILOGUE + // ======================================== + + + // Load existing C values + auto c_fp64_frag = accumulator.make_partition_and_copy(tile_c_fp64_gmem); + + if(accumulator.is_thread_active()) { + // Convert accumulated int32_t results back to double precision + // and apply appropriate scaling based on slice positions + auto gemm_results = accumulator.get_results(); + + + // Process each element in the register fragment +# pragma unroll + for (int i = 0; i < cublasdx::size(c_fp64_frag); ++i) { + const auto [global_x, global_y] = accumulator.map_fragment_index(i); + const auto shift_a_elem = smem_shift_a(global_x); + const auto shift_at_elem = smem_shift_at(global_y); + + auto const total_x = block_offset_m + global_x; + auto const total_y = block_offset_n + global_y; + bool const is_in_bounds = (output_half == tutorial::matrix_half::lower and (total_x >= total_y)) or + (output_half == tutorial::matrix_half::upper and (total_y >= total_x)); + + // Convert int32_t slice result back to double precision + // with appropriate scaling for this diagonal and element + double const scaled_unsliced_value = + alpha * nth_slice_to_fp64(diag, gemm_results(i), shift_a_elem + shift_at_elem); + + c_fp64_frag(i) = is_in_bounds ? (scaled_unsliced_value + (diag == initial_diag ? 
beta : 1.0) * c_fp64_frag(i)) : c_fp64_frag(i); + } + + } + + accumulator.partition_and_copy(c_fp64_frag, tile_c_fp64_gmem); + } + + } +#endif +} diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/kernel_config.hpp.inc b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/kernel_config.hpp.inc new file mode 100644 index 00000000..c4d09827 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/kernel_config.hpp.inc @@ -0,0 +1,19 @@ + + dim3 grid(shape_a_rows / static_tile_m(), shape_a_rows / static_tile_n()); + auto kernel = fused_epilogue_kernel; + + auto shared_memory_size = cublasdx::make_shared_storage_calculator() + .add(device_pipeline.buffer_alignment(), device_pipeline.buffer_size()) + .add(16, sizeof(int32_t), static_tile_m()) // shift_a + .add(16, sizeof(int32_t), static_tile_n()) // shift_b + .get(); + + CUDA_CHECK_AND_EXIT( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/kernel_launch.hpp.inc b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/kernel_launch.hpp.inc new file mode 100644 index 00000000..1b5d93ad --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/kernel_launch.hpp.inc @@ -0,0 +1,3 @@ + + kernel<<>>( + device_pipeline, alpha, beta, tensor_c, output_half, tensor_shift_a, tensor_shift_at); diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/parameters.hpp.inc b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/parameters.hpp.inc new file mode 100644 index 00000000..f7b5d4e5 --- /dev/null +++ 
b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/parameters.hpp.inc @@ -0,0 +1,24 @@ + + int const warm_up_runs = 10; + int const kernel_runs = 100; + + // =================================== + // Problem configuration + // =================================== + + // (syrk_n, syrk_k, alpha, beta, uplo) + std::vector problems = { + {2048, 2048, 0.9, 1.1, tutorial::matrix_half::upper}, + {2048, 2048, 0.9, 1.1, tutorial::matrix_half::lower} + }; + + + // =================================== + // Global SYRK configuration + // =================================== + + // The number of slices used in emulation algorithm + // More slices = higher precision but more computation + constexpr unsigned slices = 7; + + bool const debug = false; diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/pipeline_config.hpp.inc b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/pipeline_config.hpp.inc new file mode 100644 index 00000000..a7afbe64 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/pipeline_config.hpp.inc @@ -0,0 +1,6 @@ + + constexpr unsigned pipeline_depth = 2; + + auto device_pipeline = cublasdx::suggest_device_pipeline( + tensor_slice_a, tensor_slice_at) + .value(); diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/slicing.hpp b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/slicing.hpp new file mode 100644 index 00000000..c39cbfcd --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/slicing.hpp @@ -0,0 +1,221 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#define SLICING_FUNCTION __host__ __device__ __forceinline__ + +union double_structure { + double d; + struct float64 { + unsigned int mantissa_lo : 32; + unsigned int mantissa_hi : 20; + unsigned int exponent : 11; + unsigned int sign : 1; + } s; +}; + +static constexpr int bias = 1023; + +/* + * Signed magnitudes of length N only allow for (N-1) of effective storage. + */ +template +SLICING_FUNCTION constexpr int get_width() { + if constexpr (cuda::std::is_signed()) { + return 8 * sizeof(T) - 1; + } else { + return 8 * sizeof(T); + }; +} + +SLICING_FUNCTION int64_t div_up(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +SLICING_FUNCTION int rz_width(const double_structure& em) { + return em.s.exponent + 1 - bias; +} +// Length of a bits before the decimal point. i.e., bit width if casted to infinite-length int type. +SLICING_FUNCTION int rz_width(const double d) { + double_structure em {d}; + return rz_width(em); +} + +SLICING_FUNCTION constexpr int64_t ipow_p(int64_t base, int exp, int64_t ans = 1) { + return exp < 1 ? ans : ipow_p(base * base, exp / 2, (exp % 2) ? ans * base : ans); +} + +SLICING_FUNCTION constexpr double ipow(int base, int exp) { + return exp > 0 ? 
ipow_p(base, exp) : 1.0 / ipow_p(base, -exp); +} + +template +SLICING_FUNCTION constexpr int max_exponent(); + +// scale to numbers no bigger than 256 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 8; +} + +// scale to numbers no bigger than 128 +template<> +SLICING_FUNCTION constexpr int max_exponent() { + return 7; +} + +SLICING_FUNCTION int32_t get_exponent(double val) { + double_structure em = {val}; + + int em_exponent = (em.s.exponent + 1 - bias); + + // NOTE: '==' binds tighter than '&', so the mask test must be parenthesized; + // otherwise this would evaluate as 'mantissa_hi & 1' and test the wrong bit. + if ((em.s.mantissa_hi & (63 << 14)) == (63 << 14)) + em_exponent++; + + return em_exponent; +} + +// An implementation of ldexp() to be used in scaling double-precision numbers obtained from unpacking slices +// in the epilogue. The resulting double values must be finite and normalized, and so the fast path should +// simply adjust the exponent field of the value, so long as the result is also finite and normalized. +SLICING_FUNCTION void epilogue_ldexp(double_structure& em, int exp) { + static constexpr int exp_max = bias - 1; + int previous_exp_biased = static_cast(em.s.exponent); + if (0 < previous_exp_biased && 0 < previous_exp_biased + exp && previous_exp_biased + exp <= exp_max + bias) { + em.s.exponent += exp; + return; + } + em.d = ldexp(em.d, exp); +} + +/* + * Returns the exponent shift to be applied to a row/column + * based on the max(abs()) on that row/column. + * Naively, this scaling factor would be just the exponent + * of max(abs()) but we do some other tricks to account for + * the encoding of the signed magnitude only on the leading + * slice among other things. + */ +SLICING_FUNCTION int32_t max_to_exponent_shift(double row_col_max) { + static constexpr int scale_max_exponent = max_exponent(); + + return scale_max_exponent - get_exponent(row_col_max); +} + +/* + * slices up the input value "val" in "nslices" of type "SliceValueType".
+ * Before slicing the number, the exponent of "val" is shifted based on + * the value of "exponent_shift" which has been computed using the + * max_to_exponent_shift function based on the max(abs()) of the relevant + * row/column of A/B. + * + * On exit, the first value of the returned array contains the most + * significant slice. + */ +template +SLICING_FUNCTION cuda::std::array slices_from_fp64(double val, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + + cuda::std::array slices = {0}; + + static constexpr double normalization_factor = 0x1.0p52; + // Normalise denormalised numbers, but store the effective exponent in its own variable, + // allowing for representation of fp64 denorms as normalised numbers. + + int skip_slices = 0; + int64_t r = 0; + + uint8_t reg_pack = 0; + + double_structure r0 = {val}; + int denorm_compensation = 0; + if (r0.s.exponent == 0) { + if (r0.d == 0.0) { + skip_slices = nslices; + r = 0; + } else { + /* round to nearest is the default behavior on CPU */ + r0.d = (r0.d * normalization_factor); + denorm_compensation = -52; + } + } + int exp = r0.s.exponent + exponent_shift + denorm_compensation - bias; + exp += (nslices - 1) * get_width(); // Use all 8 bits. + + // Adjust casting range. + int extra_width = (exp + 1) - 63; + extra_width = extra_width > 0 ? 
extra_width : 0; + skip_slices = div_up(extra_width, get_width()); + exp -= skip_slices * get_width(); + + // Handle exp outside of double range + if (exp < 0) { + r = 0; + } else { + r0.s.exponent = (unsigned int)(exp + bias); + r = static_cast(r0.d); + } + + for (int64_t _i = 0; _i < nslices; _i++) { + int64_t i = nslices - 1 - _i; + + if (_i < skip_slices) { + reg_pack = 0; + } else { + reg_pack = static_cast(r); + slices[i] = static_cast(reg_pack); + r = (r >> get_width()) + (reg_pack >> get_width()); + } + } + + return slices; +} + +/* + * This function is a building block to reconstruct an FP64 number from the slices. + * Instead of receiving the set of slices and adding them to an FP64 number, + * this function gets a single slice (the NTH-slice) and returns it as an FP64 value. + * + * In this way, one could use this function to compute and accumulate the contributions + * from the slices separately. + * + * Remark: when reconstructing an FP64 number accumulate the least significant + * diagonals first to avoid catastrophic cancellation. 
+ */ +template +SLICING_FUNCTION double nth_slice_to_fp64(int32_t nth, DiagonalAccType nth_slice, int32_t exponent_shift) { + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + static_assert(cuda::std::is_integral()); + static_assert(cuda::std::is_signed()); + assert(nth >= 0); + + /* In some instances, we use the unsigned value type to leverage all bits for storage */ + double ko = pow(2.0, -get_width>() * nth); + + double value_i = ko * static_cast(nth_slice); + double_structure value = {value_i}; + epilogue_ldexp(value, -exponent_shift); + return value.d; +} diff --git a/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/syrk_emulation.cu b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/syrk_emulation.cu new file mode 100644 index 00000000..11faf6d8 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/04-Challenge-Exercises/cpp/3a_fused_syrk_emulation/syrk_emulation.cu @@ -0,0 +1,264 @@ +// std libraries +#include + +// cuda std libraries +#include +#include + +// cuda libraries +#include + +// utility headers +#include +#include +#include + +#include "slicing.hpp" +#include "emulation_kernels.hpp" + +template +auto run_tutorial_kernel(double alpha, + ATensor const& tensor_a, + double beta, + CTensor const& tensor_c, + tutorial::matrix_half output_half, + cudaStream_t stream = 0, + unsigned warm_up_runs = 10, + unsigned kernel_runs = 100, + bool debug = false) { + + float total_time = 0.f; + + using shift_t = int32_t; + using slice_value_type = typename BLAS::a_value_type; + using accumulator_value_type = typename BLAS::c_value_type; + + // Number of slices per elements + auto const static_slices = cuda::std::integral_constant {}; + + // Create slice tensor A: [n, k, slices] - stores int8_t slices of matrix A + auto const [shape_a_rows_, shape_a_cols_] = tensor_a.layout().shape(); + int const shape_a_rows = shape_a_rows_; + 
int const shape_a_cols = shape_a_cols_; + constexpr auto arr_a = cublasdx::arrangement_of_v_a; + auto d_slice_a_storage = + tutorial::get_empty_device_tensor(shape_a_rows, shape_a_cols, static_slices); + // Capturing a structured binding into lambda is a C++20 feature + auto tensor_slice_a = cuda::std::get<1>(d_slice_a_storage); + + // Construct a transposed view of A slice tensor + auto [stride_n, stride_k, stride_slices] = tensor_slice_a.stride(); + auto const at_shape = cuda::std::make_tuple(shape_a_cols, shape_a_rows, static_slices); + auto const at_stride = cuda::std::make_tuple(stride_k, stride_n, stride_slices); + auto tensor_slice_at = + tutorial::make_gmem_tensor_from_tuples(tutorial::raw_pointer_cast(tensor_slice_a.data()), at_shape, at_stride); + + constexpr auto shift_arr = cublasdx::col_major; + + // Create tensors for the shift values with proper tiling structure + auto const static_tile_m = cuda::std::integral_constant> {}; + auto d_shift_storage = + tutorial::get_empty_device_tensor(static_tile_m, shape_a_rows / static_tile_m()); + auto tensor_shift_a = cuda::std::get<1>(d_shift_storage); + + auto const static_tile_n = cuda::std::integral_constant> {}; + auto const shift_at_shape = cuda::std::make_tuple(static_tile_n, shape_a_rows / static_tile_n()); + auto tensor_shift_at = tutorial::make_gmem_tensor_from_tuples( + tutorial::raw_pointer_cast(tensor_shift_a.data()), shift_at_shape); + + auto const static_tile_k = cuda::std::integral_constant> {}; + + // Execute preprocessing kernels to find maximum values and compute scaling factors + { + auto run_preprocessing = [&](auto str) { + // Find max absolute value in each row of A and convert to exponent shift + constexpr int reduction_block_size = 64; + max_reduce_kernel + <<>>(tensor_a, tensor_shift_a); + }; + + auto time_ms = tutorial::measure::execution(run_preprocessing, warm_up_runs, kernel_runs, stream); + + total_time += time_ms; + if (debug) { + std::cout << "----> Custom Preprocess time: " << 
time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 3: SLICING */ + /* Slice up input A and B matrices */ + /* ============================================================== */ + + // This step decomposes each double precision value into multiple int8_t slices. + // For a double precision value x with scaling factor s, we create slices such that: + // x ≈ Σ(i=0 to slices-1) slice_i * 2^(s - i*8) + // where each slice_i is an int8_t value. + + { + + auto run_slicing = [&](auto str) { + constexpr auto slice_kernel_block_size = 64; + slice_kernel + <<>>( + tensor_a, tensor_shift_a, tensor_slice_a, shape_a_cols); + }; + + auto time_ms = tutorial::measure::execution(run_slicing, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Slice time: " << time_ms << " ms" << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + /* ============================================================== */ + /* OZAKI SCHEME STEP 4: MATRIX MULTIPLICATION */ + /* Product of slices */ + /* ============================================================== */ + + // This is the core of the Ozaki scheme. We need to compute the product: + // C = A * B = (Σ A_i * 2^shift_A_i) * (Σ B_j * 2^shift_B_j) + // = ΣΣ A_i * B_j * 2^(shift_A_i + shift_B_j) + // + // We compute this as multiple GEMM operations between slice combinations, + // with each result scaled appropriately and accumulated into the final result. 
+ + { + #include "pipeline_config.hpp.inc" + + #include "kernel_config.hpp.inc" + + auto dummy_c_storage = tutorial::get_copy_tensor(tensor_c); + auto dummy_tensor_c = cuda::std::get<1>(dummy_c_storage); + + auto run_fused_epilogue = [&](auto str) { + kernel<<>>( + device_pipeline, alpha, beta, dummy_tensor_c, output_half, tensor_shift_a, tensor_shift_at); + }; + + auto time_ms = tutorial::measure::execution(run_fused_epilogue, warm_up_runs, kernel_runs, stream); + total_time += time_ms; + + if (debug) { + std::cout << "----> Custom Epilogue time: " << time_ms << " ms" << std::endl; + } + + // Run correctness check + #include "kernel_launch.hpp.inc" + + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaStreamSynchronize(stream)); + } + + std::vector results(tensor_c.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(results.data(), + tutorial::raw_pointer_cast(tensor_c.data()), + tensor_c.size() * sizeof(double), + cudaMemcpyDeviceToHost)); + + // performance runs + auto avg_tflops = tutorial::real_syrk_tflops(shape_a_rows, shape_a_cols) / total_time; + return cuda::std::make_tuple(total_time, avg_tflops, results); +} + +int main(int argc, char** argv) { + using alpha_value_type = double; + using beta_value_type = double; + + constexpr auto arrangement_a = cublasdx::row_major; + constexpr auto arrangement_c = cublasdx::col_major; + + // Automatically choose transposed + constexpr auto arrangement_a_t = (arrangement_a == cublasdx::col_major) ? cublasdx::row_major : cublasdx::col_major; + + #include "parameters.hpp.inc" + + for (tutorial::syrk_problem_t problem : problems) { + int const n = problem.n; + int const k = problem.k; + double const alpha = problem.alpha; + double const beta = problem.beta; + tutorial::matrix_half const output_half = problem.uplo; + + std::cout << "Computing SYRK N=" << n << " K=" << k + << " uplo=" << (output_half == tutorial::matrix_half::upper ? 
"upper" : "lower") + << " (slices=" << slices << ")\n"; + + // =================================== + // Ozaki scheme configuration + // =================================== + + #include "cublasdx_config.hpp.inc" + + if (n % tile_n != 0 or k % tile_k != 0) { + std::cerr << "Problem shape must be divisible by tile shape" << std::endl; + exit(-1); + } + + // =================================== + // Data type definitions + // =================================== + + using a_value_type = double; + using b_value_type = double; + using c_value_type = double; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + if (debug) { + tutorial::print_device_properties(); + } + + /* ============================================================== */ + /* Input FP64 (host) tensors */ + /* ============================================================== */ + static const float range_lower_bound = 1.0f / 3.14f; + static const float range_upper_bound = 52.0f / 3.14f; + int seed = 1234; + constexpr tutorial::random_distribution dist = tutorial::random_distribution::uniform; + + auto [vector_a, tensor_a] = tutorial::get_random_device_tensor( + n, k, range_lower_bound, range_upper_bound, seed); + + auto [vector_c_custom, tensor_c_custom] = tutorial::get_symmetric_random_device_tensor( + n, output_half, range_lower_bound, range_upper_bound, seed + 1); + auto [vector_c_reference, tensor_c_reference] = tutorial::get_copy_tensor(tensor_c_custom); + + auto [time_reference, tflops_reference, results_reference] = tutorial::cublaslt_reference( + alpha, tensor_a, beta, tensor_c_reference, output_half, stream, warm_up_runs, kernel_runs); + + auto [time_tutorial, tflops_tutorial, results_tutorial] = run_tutorial_kernel( + alpha, tensor_a, beta, tensor_c_custom, output_half, stream, warm_up_runs, kernel_runs); + + /* ========================================================================================= */ + /* Print summary of performance and correctness results */ + /* 
def _monkey_patch_cublasdx_numba():
    """Extend nvmath's cublasdx numba bindings with extra definition args.

    WAR for nvmath-python (will be fixed in 0.8.1+): registers argument
    names the tutorial kernels pass to BLAS definitions but which the
    shipped binding does not yet accept.
    """
    from nvmath.device import cublasdx_numba as _cublasdx_numba_monkey_patch
    # Guard so re-importing this module does not append the names twice.
    if "with_pipeline" not in _cublasdx_numba_monkey_patch._BLAS_DEFINITION_ARGS:
        _cublasdx_numba_monkey_patch._BLAS_DEFINITION_ARGS += [
            "with_pipeline", "enable_input_streaming", "static_block_dim"
        ]


_monkey_patch_cublasdx_numba()
numpy as np +import cupy as cp +from numba import cuda +import nvmath.linalg +from nvmath.bindings import cublas +from common import random_real, mm_perf_GFlops +from common_numba import time_numba, set_max_dynamic_shared_size_bytes +from common_cupy import time_cupy +from emulation_slicing_impl import get_slice_kernel, get_max_reduce_kernel, slice_matrix +from emulation_utils import MatrixHalf + + +def _calculate_accuracy_metrics(h_result, h_reference): + """Calculate various accuracy metrics comparing result to reference.""" + result_norm = np.linalg.norm(h_result) + reference_norm = np.linalg.norm(h_reference) + + diff = h_result - h_reference + abs_diff = np.abs(diff) + + # Relative error per element (avoiding division by zero) + with np.errstate(divide='ignore', invalid='ignore'): + relative_errors = abs_diff / np.abs(h_reference) + relative_errors = np.where(np.isfinite(relative_errors), relative_errors, 0) + + avg_relative_error = np.mean(relative_errors) + max_relative_error = np.max(relative_errors) + max_absolute_error = np.max(abs_diff) + total_relative_error = np.linalg.norm(diff) / reference_norm + + return { + 'result_norm': result_norm, + 'reference_norm': reference_norm, + 'avg_relative_error': avg_relative_error, + 'max_relative_error': max_relative_error, + 'max_absolute_error': max_absolute_error, + 'total_relative_error': total_relative_error + } + + +def _print_accuracy_metrics(metrics): + """Print accuracy metrics in standard format.""" + print(f"Vector reference norm: [{metrics['reference_norm']:.5e}]") + print(f"Vector result norm: [{metrics['result_norm']:.5e}]") + print(f"Vector relative error: [{metrics['total_relative_error']:.5e}]") + print(f"Average relative error: [{metrics['avg_relative_error']:.5e}]") + print(f"Maximum relative error: [{metrics['max_relative_error']:.5e}]") + print(f"Maximum absolute error: [{metrics['max_absolute_error']:.5e}]") + print(f"Total relative error = {metrics['total_relative_error']:.10f}") + + +def 
def _print_performance(time_ms, tflops, label="Custom Kernel"):
    """Print performance metrics in standard format."""
    print(label)
    print(f"Avg time [ms] = {time_ms:.4f}")
    print(f"Avg TFLOP/s = {tflops:.4f}\n")


def benchmark_dgemm(
    shapes,
    get_kernel_func,
    kernel_state_func,
    grid_func,
    block_func,
    kernel_args_func=None,
    shared_mem_func=None,
    validation_func=None,
    repeats=10
):
    """
    Generic DGEMM benchmark framework.

    Args:
        shapes: List of tuples (m, n, k, alpha, beta) defining problem sizes
        get_kernel_func: Function that returns the kernel. Called as get_kernel_func(state)
        kernel_state_func: Function(m, n, k, alpha, beta) -> state object (can be anything)
        grid_func: Function(m, n, k, state) -> grid dimensions tuple
        block_func: Function(m, n, k, state) -> block dimensions tuple
        kernel_args_func: Optional function(state, alpha, d_A, d_B, beta, d_C) -> tuple of kernel args.
            Defaults to (alpha, d_A, d_B, beta, d_C).
        shared_mem_func: Optional function(state, kernel_args) -> shared memory size in bytes.
            Defaults to 0.
        validation_func: Optional function(m, n, k, alpha, beta, block) to validate parameters
        repeats: Number of timing iterations
    """
    for shape in shapes:
        m, n, k, alpha, beta = shape

        # Create state that can be shared across kernel/grid/block functions
        state = kernel_state_func(m, n, k, alpha, beta)

        # Get block and grid using the state
        block = block_func(m, n, k, state)
        grid = grid_func(m, n, k, state)

        print(f"\nComputing GEMM M={m} N={n} K={k}\n")

        # Default validation: problem must tile evenly over the launch block
        if validation_func:
            validation_func(m, n, k, alpha, beta, block)
        else:
            assert m % block[0] == 0, "Invalid M dimension for block size"
            assert n % block[1] == 0, "Invalid N dimension for block size"
            assert k % 16 == 0, "Invalid K dimension"

        # Prepare data
        h_A = random_real((m, k), np.float64, order='C')
        h_B = random_real((k, n), np.float64, order='F')
        h_C = random_real((m, n), np.float64, order='F')

        d_A_cp = cp.array(h_A)
        d_B_cp = cp.array(h_B)
        d_C_cp = cp.array(h_C)

        d_A = cuda.as_cuda_array(d_A_cp)
        d_B = cuda.as_cuda_array(d_B_cp)
        d_C = cuda.as_cuda_array(d_C_cp)

        # Get kernel using the state
        dgemm_kernel = get_kernel_func(state)

        # Prepare kernel arguments
        if kernel_args_func:
            kernel_args = kernel_args_func(state, alpha, d_A, d_B, beta, d_C)
        else:
            kernel_args = (alpha, d_A, d_B, beta, d_C)

        # Calculate shared memory size (after kernel args are created)
        shared_mem_size = shared_mem_func(state, kernel_args) if shared_mem_func else 0

        # Set max dynamic shared memory size if needed
        if shared_mem_size > 0:
            set_max_dynamic_shared_size_bytes(dgemm_kernel, shared_mem_size, *kernel_args)

        # Run custom kernel once to get result
        dgemm_kernel[grid, block, 0, shared_mem_size](*kernel_args)
        cuda.synchronize()

        # Run reference
        d_CRef_cp = cp.array(h_C)
        d_CRef_cp = nvmath.linalg.matmul(d_A_cp, d_B_cp, c=d_CRef_cp, alpha=alpha, beta=beta)

        # Calculate accuracy
        h_C = cp.asnumpy(d_C_cp)
        h_CRef = cp.asnumpy(d_CRef_cp)
        metrics = _calculate_accuracy_metrics(h_C, h_CRef)

        # Time custom kernel. Unpacking `kernel_args` covers both the custom
        # and default argument paths, since the default tuple is exactly
        # (alpha, d_A, d_B, beta, d_C).
        time_ms = time_numba(
            dgemm_kernel,
            grid,
            block,
            shared_mem_size,
            repeats,
            *kernel_args)

        tflops = mm_perf_GFlops((m, n, k), 1, time_ms) / 1000.0
        _print_performance(time_ms, tflops, "Custom Kernel")

        # Time reference (cuBLASLt via nvmath)
        d_CRef2_cp = cp.array(h_C)

        def matmul_ref():
            return nvmath.linalg.matmul(d_A_cp, d_B_cp, c=d_CRef2_cp, alpha=alpha, beta=beta)

        ref_time_ms = time_cupy(matmul_ref, repeats)
        ref_tflops = mm_perf_GFlops((m, n, k), 1, ref_time_ms) / 1000.0
        _print_performance(ref_time_ms, ref_tflops, "cuBLASLt (not including heuristic)")

        # Print accuracy information
        _print_accuracy_metrics(metrics)

        # Calculate performance percentage
        performance_pct = (ref_time_ms / time_ms) * 100.0
        print(f"{performance_pct:.2f}% reference performance ")
def benchmark_emulated_dgemm(
    shapes,
    setup_func,
    gemm_func,
    epilogue_func=None,
    allocate_products=False,
    allocate_diagonals=False,
    num_products_func=None,
    slices=7,
    repeats=10
):
    """
    Benchmark framework for emulated DGEMM.

    Handles slicing and allocations, then calls user-provided GEMM and epilogue functions.
    Assumes fixed layout: row_major A, col_major B, col_major C.

    Args:
        shapes: List of tuples (m, n, k, alpha, beta) defining problem sizes
        setup_func: Function(m, n, k) -> context object passed through to
            gemm_func and epilogue_func
        gemm_func: Function(d_A_sliced, d_B_sliced, d_products/d_diag/d_C,
            d_shift_a, d_shift_b, alpha, beta, context, warmup=True)
            that performs the GEMM operation
        epilogue_func: Optional function(slices, d_products/d_diag, d_shift_a,
            d_shift_b, d_C, alpha, beta, context) for reconstruction.
            Other dimensions are inferred from tensor shapes.
        allocate_products: If True, allocates products tensor (M×N×num_products) for unfused
        allocate_diagonals: If True, allocates diagonal tensor (M×N×slices) for partial fused
        num_products_func: Optional function(slices) -> int for number of products.
            Defaults to slices*(slices+1)//2 if allocate_products=True, else slices.
        slices: Number of slices (default: 7)
        repeats: Number of timing iterations
    """
    for shape in shapes:
        m, n, k, alpha, beta = shape

        context = setup_func(m, n, k)

        block_size = 64  # Default block size for slicing kernels

        print(f"\nComputing Emulated GEMM M={m} N={n} K={k} (slices={slices})\n")

        # Prepare data - fixed layout: row_major A, col_major B, col_major C
        h_A = random_real((m, k), np.float64, order='C')  # row_major
        h_B = random_real((k, n), np.float64, order='F')  # col_major
        h_C = random_real((m, n), np.float64, order='F')  # col_major

        d_A_cp = cp.array(h_A)
        d_B_cp = cp.array(h_B)
        d_C_cp = cp.array(h_C)

        d_A = cuda.as_cuda_array(d_A_cp)
        d_B = cuda.as_cuda_array(d_B_cp)
        d_C = cuda.as_cuda_array(d_C_cp)

        # Allocate sliced tensors with fixed strides
        # A: row_major -> strides = (K, 1, M*K)
        # B: col_major -> strides = (1, K, K*N)
        itemsize = np.dtype(np.int8).itemsize
        strides_a = (k * itemsize, 1 * itemsize, m * k * itemsize)

        d_A_sliced_cp = cp.ndarray(
            shape=(m, k, slices),
            dtype=np.int8,
            memptr=cp.cuda.alloc(m * k * slices * itemsize),
            strides=strides_a
        )
        d_A_sliced = cuda.as_cuda_array(d_A_sliced_cp)

        strides_b = (1 * itemsize, k * itemsize, k * n * itemsize)

        d_B_sliced_cp = cp.ndarray(
            shape=(k, n, slices),
            dtype=np.int8,
            memptr=cp.cuda.alloc(k * n * slices * itemsize),
            strides=strides_b
        )
        d_B_sliced = cuda.as_cuda_array(d_B_sliced_cp)

        # Allocate shift tensors (per-row of A / per-column of B exponents)
        d_shift_a = cuda.device_array(m, dtype=np.int32)
        d_shift_b = cuda.device_array(n, dtype=np.int32)

        # Allocate products or diagonal tensor based on flags
        itemsize_int32 = np.dtype(np.int32).itemsize

        if allocate_products:
            # Allocate products tensor for unfused (M×N×num_products)
            num_products = num_products_func(slices) if num_products_func else slices * (slices + 1) // 2
            # C is col_major -> strides = (1, M, M*N)
            strides_products = (1 * itemsize_int32, m * itemsize_int32, m * n * itemsize_int32)

            d_products_cp = cp.ndarray(
                shape=(m, n, num_products),
                dtype=np.int32,
                memptr=cp.cuda.alloc(m * n * num_products * itemsize_int32),
                strides=strides_products
            )
            d_gemm_out_cp = d_products_cp
            d_gemm_out_numba = cuda.as_cuda_array(d_products_cp)
        elif allocate_diagonals:
            # Allocate diagonal tensor for partial fusion (M×N×slices)
            num_diag = num_products_func(slices) if num_products_func else slices
            # C is col_major -> strides = (1, M, M*N)
            strides_diag = (1 * itemsize_int32, m * itemsize_int32, m * n * itemsize_int32)

            d_diag_cp = cp.ndarray(
                shape=(m, n, num_diag),
                dtype=np.int32,
                memptr=cp.cuda.alloc(m * n * num_diag * itemsize_int32),
                strides=strides_diag
            )
            d_gemm_out_cp = d_diag_cp
            d_gemm_out_numba = cuda.as_cuda_array(d_diag_cp)
        else:
            # Fully fused: the kernel writes straight into C
            d_gemm_out_cp = d_C_cp
            d_gemm_out_numba = d_C

        # Get slicing kernels
        max_reduce_a = get_max_reduce_kernel(slice_matrix.a, BlockSize=block_size)
        max_reduce_b = get_max_reduce_kernel(slice_matrix.b, BlockSize=block_size)
        slice_kernel_a = get_slice_kernel(slice_matrix.a, Slices=slices, BlockSize=block_size)
        slice_kernel_b = get_slice_kernel(slice_matrix.b, Slices=slices, BlockSize=block_size)

        # Perform slicing
        blocks_a = m
        blocks_b = n
        max_reduce_a[blocks_a, block_size](d_A, d_shift_a)
        max_reduce_b[blocks_b, block_size](d_B, d_shift_b)

        num_elements_a = m * k
        num_elements_b = k * n
        blocks_slice_a = (num_elements_a + block_size - 1) // block_size
        blocks_slice_b = (num_elements_b + block_size - 1) // block_size

        slice_kernel_a[blocks_slice_a, block_size](d_A, d_shift_a, d_A_sliced, k)
        slice_kernel_b[blocks_slice_b, block_size](d_B, d_shift_b, d_B_sliced, n)

        # Run GEMM function (pass CuPy arrays)
        gemm_func(d_A_sliced_cp, d_B_sliced_cp, d_gemm_out_cp, d_shift_a, d_shift_b, alpha, beta, context)

        # Run epilogue if provided (pass Numba arrays for kernel compatibility)
        if epilogue_func:
            epilogue_func(slices, d_gemm_out_numba, d_shift_a, d_shift_b, d_C, alpha, beta, context)

        cuda.synchronize()

        # Run reference
        d_CRef_cp = cp.array(h_C)
        d_CRef_cp = nvmath.linalg.matmul(d_A_cp, d_B_cp, c=d_CRef_cp, alpha=alpha, beta=beta)

        # Calculate accuracy
        h_C = cp.asnumpy(d_C_cp)
        h_CRef = cp.asnumpy(d_CRef_cp)
        metrics = _calculate_accuracy_metrics(h_C, h_CRef)

        # Time slicing separately
        # Start with a warmup
        max_reduce_a[blocks_a, block_size](d_A, d_shift_a)
        max_reduce_b[blocks_b, block_size](d_B, d_shift_b)
        slice_kernel_a[blocks_slice_a, block_size](d_A, d_shift_a, d_A_sliced, k)
        slice_kernel_b[blocks_slice_b, block_size](d_B, d_shift_b, d_B_sliced, n)

        start = cp.cuda.Event()
        end = cp.cuda.Event()
        start.record()
        for _ in range(repeats):
            max_reduce_a[blocks_a, block_size](d_A, d_shift_a)
            max_reduce_b[blocks_b, block_size](d_B, d_shift_b)
            slice_kernel_a[blocks_slice_a, block_size](d_A, d_shift_a, d_A_sliced, k)
            slice_kernel_b[blocks_slice_b, block_size](d_B, d_shift_b, d_B_sliced, n)
        end.record()
        end.synchronize()
        slicing_time_ms = cp.cuda.get_elapsed_time(start, end) / repeats

        # Warmup GEMM
        gemm_func(d_A_sliced_cp, d_B_sliced_cp, d_gemm_out_cp, d_shift_a, d_shift_b, alpha, beta, context)

        # Time GEMM separately
        start.record()
        for _ in range(repeats):
            gemm_func(d_A_sliced_cp, d_B_sliced_cp, d_gemm_out_cp, d_shift_a, d_shift_b, alpha, beta, context, warmup=False)
        end.record()
        end.synchronize()
        gemm_time_ms = cp.cuda.get_elapsed_time(start, end) / repeats

        # Time epilogue separately (if present)
        epilogue_time_ms = 0.0
        if epilogue_func:
            # Warmup epilogue
            epilogue_func(slices, d_gemm_out_numba, d_shift_a, d_shift_b, d_C, alpha, beta, context)
            start.record()
            for _ in range(repeats):
                epilogue_func(slices, d_gemm_out_numba, d_shift_a, d_shift_b, d_C, alpha, beta, context)
            end.record()
            end.synchronize()
            epilogue_time_ms = cp.cuda.get_elapsed_time(start, end) / repeats

        # Calculate E2E time
        e2e_time_ms = slicing_time_ms + gemm_time_ms + epilogue_time_ms

        # Print timing breakdown
        print("Emulated GEMM Timing Breakdown")
        print(f"Slicing time [ms] = {slicing_time_ms:.4f}")
        print(f"GEMM time [ms] = {gemm_time_ms:.4f}")
        if epilogue_func:
            print(f"Epilogue time [ms] = {epilogue_time_ms:.4f}")
        print(f"E2E time [ms] = {e2e_time_ms:.4f}")

        e2e_tflops = mm_perf_GFlops((m, n, k), 1, e2e_time_ms) / 1000.0
        print(f"E2E TFLOP/s = {e2e_tflops:.4f}\n")

        # Time reference (cuBLASLt via nvmath)
        d_CRef2_cp = cp.array(h_C)

        def matmul_ref():
            return nvmath.linalg.matmul(d_A_cp, d_B_cp, c=d_CRef2_cp, alpha=alpha, beta=beta)

        ref_time_ms = time_cupy(matmul_ref, repeats)
        ref_tflops = mm_perf_GFlops((m, n, k), 1, ref_time_ms) / 1000.0
        _print_performance(ref_time_ms, ref_tflops, "cuBLASLt (not including heuristic)")

        # Print accuracy information
        _print_accuracy_metrics(metrics)

        # Calculate performance percentage
        performance_pct = (ref_time_ms / e2e_time_ms) * 100.0
        print(f"{performance_pct:.2f}% reference performance ")


# Convenience wrappers for specific exercises
def benchmark_dgemm_2_1(shapes, get_dgemm_kernel, choose_kernel_params, repeats=10):
    """Exercise 2.1: Variable block size, kernel factory takes block dimensions."""
    return benchmark_dgemm(
        shapes=shapes,
        get_kernel_func=lambda state: get_dgemm_kernel(*state),
        kernel_state_func=lambda m, n, k, alpha, beta: choose_kernel_params(m, n, k, alpha, beta),
        grid_func=lambda m, n, k, state: (m // state[0], n // state[1]),
        block_func=lambda m, n, k, state: state,
        repeats=repeats
    )


def benchmark_dgemm_2_2(shapes, get_dgemm_kernel, repeats=10):
    """Exercise 2.2: Fixed 16x16 block size, kernel factory takes no arguments."""
    return benchmark_dgemm(
        shapes=shapes,
        get_kernel_func=lambda state: get_dgemm_kernel(),
        kernel_state_func=lambda m, n, k, alpha, beta: (16, 16),
        grid_func=lambda m, n, k, state: (m // state[0], n // state[1]),
        block_func=lambda m, n, k, state: state,
        repeats=repeats
    )


def benchmark_dgemm_2_3(shapes, get_dgemm_kernel, choose_kernel_params, shared_mem_func, kernel_args_func, repeats=10):
    """Exercise 2.3: Using cublasDx with pipeline API."""
    # NOTE: the original docstring said "Exercise 2.4"; corrected to match
    # the function name (this variant takes kernel_args_func for the pipeline).

    def validation_func(m, n, k, alpha, beta, block):
        pass

    return benchmark_dgemm(
        shapes=shapes,
        get_kernel_func=get_dgemm_kernel,
        kernel_state_func=lambda m, n, k, alpha, beta: choose_kernel_params(m, n, k, alpha, beta),
        grid_func=lambda m, n, k, BLAS: (m // BLAS.c_dim[0], n // BLAS.c_dim[1]),
        block_func=lambda m, n, k, BLAS: (BLAS.block_size, 1),
        kernel_args_func=kernel_args_func,
        shared_mem_func=shared_mem_func,
        validation_func=validation_func,
        repeats=repeats
    )


def benchmark_dgemm_2_4(shapes, get_dgemm_kernel, choose_kernel_params, shared_mem_func, repeats=10):
    """Exercise 2.4: Using cublasDx."""
    # NOTE: the original docstring said "Exercise 2.3"; corrected to match
    # the function name.

    def validation_func(m, n, k, alpha, beta, block):
        pass

    return benchmark_dgemm(
        shapes=shapes,
        get_kernel_func=get_dgemm_kernel,
        kernel_state_func=lambda m, n, k, alpha, beta: choose_kernel_params(m, n, k, alpha, beta),
        grid_func=lambda m, n, k, BLAS: (m // BLAS.c_dim[0], n // BLAS.c_dim[1]),
        block_func=lambda m, n, k, BLAS: (BLAS.block_size, 1),
        shared_mem_func=lambda state, kernel_args: shared_mem_func(state),
        validation_func=validation_func,
        repeats=repeats
    )


def benchmark_unfused_emulated_dgemm(shapes, setup_func, igemm_func, epilogue_func, slices=7, repeats=10):
    """
    Benchmark unfused emulated DGEMM.

    Unfused implementation:
    - Computes all products (slices*(slices+1)//2) separately
    - Stores products in M×N×num_products tensor
    - Epilogue reconstructs the final result from products

    Args:
        shapes: List of tuples (m, n, k, alpha, beta)
        setup_func: Function(m, n, k) -> context object passed to the callbacks
        igemm_func: Function(d_A_sliced, d_B_sliced, d_products, context, warmup)
            that computes all integer GEMM products
        epilogue_func: Function(slices, d_products, d_shift_a, d_shift_b, d_C, alpha, beta, context)
            that reconstructs the final result
        slices: Number of slices (default: 7)
        repeats: Number of timing iterations
    """
    # Wrap igemm_func to ignore shift arguments
    gemm_wrapper = lambda a, b, prod, shift_a, shift_b, alpha, beta, context, warmup=True: igemm_func(a, b, prod, context, warmup)

    return benchmark_emulated_dgemm(
        shapes=shapes,
        setup_func=setup_func,
        gemm_func=gemm_wrapper,
        epilogue_func=epilogue_func,
        allocate_products=True,
        allocate_diagonals=False,
        num_products_func=lambda s: s * (s + 1) // 2,
        slices=slices,
        repeats=repeats
    )
def benchmark_partially_fused_emulated_dgemm(shapes, setup_func, igemm_func, epilogue_func, slices=7, repeats=10):
    """
    Benchmark partially fused emulated DGEMM.

    Partially fused implementation:
    - Computes and partially sums products into diagonal tensor
    - Stores diagonals in M×N×slices tensor (one per slice combination diagonal)
    - Epilogue reconstructs the final result from diagonals

    Args:
        shapes: List of tuples (m, n, k, alpha, beta)
        setup_func: Function(m, n, k) -> context object passed to the callbacks
        igemm_func: Function(d_A_sliced, d_B_sliced, d_diagonals, context, warmup)
            that computes diagonal sums
        epilogue_func: Function(slices, d_diagonals, d_shift_a, d_shift_b, d_C, alpha, beta, context)
            that reconstructs the final result
        slices: Number of slices (default: 7)
        repeats: Number of timing iterations
    """
    # Wrap igemm_func to ignore shift arguments
    gemm_wrapper = lambda a, b, diag, shift_a, shift_b, alpha, beta, context, warmup=True: igemm_func(a, b, diag, context, warmup)

    return benchmark_emulated_dgemm(
        shapes=shapes,
        setup_func=setup_func,
        gemm_func=gemm_wrapper,
        epilogue_func=epilogue_func,
        allocate_products=False,
        allocate_diagonals=True,
        num_products_func=lambda s: s,
        slices=slices,
        repeats=repeats
    )
def benchmark_fused_emulated_dgemm(shapes, setup_func, fused_kernel_func, slices=7, repeats=10):
    """
    Benchmark fully fused emulated DGEMM.

    Fully fused implementation:
    - Everything computed in a single kernel
    - No intermediate storage (products/diagonals)
    - Directly writes final result to d_C

    Args:
        shapes: List of tuples (m, n, k, alpha, beta)
        setup_func: Function(m, n, k) -> context object passed to the kernel callback
        fused_kernel_func: Function(d_A_sliced, d_B_sliced, d_C, d_shift_a, d_shift_b,
            alpha, beta, context, warmup=True) that performs the entire operation
            (corrected: benchmark_emulated_dgemm invokes it with alpha/beta/context
            and, on timed runs, warmup=False)
        slices: Number of slices (default: 7)
        repeats: Number of timing iterations
    """
    return benchmark_emulated_dgemm(
        shapes=shapes,
        setup_func=setup_func,
        gemm_func=fused_kernel_func,
        epilogue_func=None,
        allocate_products=False,
        allocate_diagonals=False,
        slices=slices,
        repeats=repeats
    )
Must be 'L', 'U', MatrixHalf.lower, or MatrixHalf.upper") + + context = setup_func(n, k, matrix_half) + + block_size = 64 # Default block size for slicing kernels + + print(f"\nComputing Emulated SYRK N={n} K={k} (slices={slices}, uplo={uplo_str})\n") + + # Prepare data - fixed layout: row_major A, col_major C + h_A = random_real((n, k), np.float64, order='C') # row_major + h_C = random_real((n, n), np.float64, order='F') # col_major, square + h_C_original = h_C.copy() # Save original C for checking unchanged elements + + d_A_cp = cp.array(h_A) + d_C_cp = cp.array(h_C) + + d_A = cuda.as_cuda_array(d_A_cp) + d_C = cuda.as_cuda_array(d_C_cp) + + # Allocate sliced tensors with fixed strides + # A: row_major -> strides = (K, 1, N*K) + itemsize = np.dtype(np.int8).itemsize + strides_a = (k * itemsize, 1 * itemsize, n * k * itemsize) + + d_A_sliced_cp = cp.ndarray( + shape=(n, k, slices), + dtype=np.int8, + memptr=cp.cuda.alloc(n * k * slices * itemsize), + strides=strides_a + ) + d_A_sliced = cuda.as_cuda_array(d_A_sliced_cp) + + # Allocate shift tensors (only need one for A) + d_shift_a = cuda.device_array(n, dtype=np.int32) + + # Get slicing kernels (only need for A) + max_reduce_a = get_max_reduce_kernel(slice_matrix.a, BlockSize=block_size) + slice_kernel_a = get_slice_kernel(slice_matrix.a, Slices=slices, BlockSize=block_size) + + # Perform slicing + blocks_a = n + max_reduce_a[blocks_a, block_size](d_A, d_shift_a) + + num_elements_a = n * k + blocks_slice_a = (num_elements_a + block_size - 1) // block_size + + slice_kernel_a[blocks_slice_a, block_size](d_A, d_shift_a, d_A_sliced, k) + + # Run fused SYRK kernel + fused_kernel_func(d_A_sliced_cp, d_C_cp, d_shift_a, alpha, beta, context) + + cuda.synchronize() + + # Run reference using cuBLAS SYRK + d_CRef_cp = cp.array(h_C) + + # cuBLAS parameters for row-major A + # uplo: 0 = CUBLAS_FILL_MODE_LOWER, 1 = CUBLAS_FILL_MODE_UPPER + # trans: 1 = CUBLAS_OP_T (transpose) + # For row-major A (N×K): tell cuBLAS to 
transpose, so it computes A @ A^T + cublas_uplo = 0 if matrix_half == MatrixHalf.lower else 1 + cublas_trans = 1 # transpose (because A is row-major) + lda = k # leading dimension for row-major N×K is k (stride between rows) + ldc = n # leading dimension for col-major C + + # Create scalar pointers for alpha and beta (on host for cuBLAS) + alpha_host = np.array([alpha], dtype=np.float64) + beta_host = np.array([beta], dtype=np.float64) + alpha_ptr = alpha_host.ctypes.data + beta_ptr = beta_host.ctypes.data + + cublas.dsyrk( + handle, + cublas_uplo, + cublas_trans, + n, + k, + alpha_ptr, + d_A_cp.data.ptr, + lda, + beta_ptr, + d_CRef_cp.data.ptr, + ldc + ) + cp.cuda.Stream.null.synchronize() + + # Calculate accuracy (only for the relevant triangular part) + h_C = cp.asnumpy(d_C_cp) + h_CRef = cp.asnumpy(d_CRef_cp) + + # Create mask for the triangular part that should be updated + if matrix_half == MatrixHalf.lower: + # Lower triangle (including diagonal): i >= j + mask = np.tri(n, n, k=0, dtype=bool) + else: + # Upper triangle (including diagonal): i <= j + mask = np.tri(n, n, k=0, dtype=bool).T + + # Check 1: Accuracy metrics only for the triangular part + h_C_tri = h_C[mask] + h_CRef_tri = h_CRef[mask] + metrics = _calculate_accuracy_metrics(h_C_tri, h_CRef_tri) + + # Check 2: Verify that elements outside the triangular part haven't been modified + outside_mask = ~mask + h_C_outside = h_C[outside_mask] + h_C_orig_outside = h_C_original[outside_mask] + + max_outside_change = np.max(np.abs(h_C_outside - h_C_orig_outside)) + num_changed_outside = np.sum(np.abs(h_C_outside - h_C_orig_outside) > 1e-15) + + print(f"\nTriangular Check:") + print(f"Elements that should be unchanged: {np.sum(outside_mask)}") + print(f"Elements modified outside triangle: {num_changed_outside}") + print(f"Max change outside triangle: {max_outside_change:.5e}\n") + + # Time slicing separately + # Start with a warmup + max_reduce_a[blocks_a, block_size](d_A, d_shift_a) + 
slice_kernel_a[blocks_slice_a, block_size](d_A, d_shift_a, d_A_sliced, k) + + start = cp.cuda.Event() + end = cp.cuda.Event() + start.record() + for _ in range(repeats): + max_reduce_a[blocks_a, block_size](d_A, d_shift_a) + slice_kernel_a[blocks_slice_a, block_size](d_A, d_shift_a, d_A_sliced, k) + end.record() + end.synchronize() + slicing_time_ms = cp.cuda.get_elapsed_time(start, end) / repeats + + # Warmup fused kernel + fused_kernel_func(d_A_sliced_cp, d_C_cp, d_shift_a, alpha, beta, context) + + # Time fused kernel separately + start.record() + for _ in range(repeats): + fused_kernel_func(d_A_sliced_cp, d_C_cp, d_shift_a, alpha, beta, context, warmup=False) + end.record() + end.synchronize() + kernel_time_ms = cp.cuda.get_elapsed_time(start, end) / repeats + + # Calculate E2E time + e2e_time_ms = slicing_time_ms + kernel_time_ms + + # Print timing breakdown + print("Emulated SYRK Timing Breakdown") + print(f"Slicing time [ms] = {slicing_time_ms:.4f}") + print(f"Kernel time [ms] = {kernel_time_ms:.4f}") + print(f"E2E time [ms] = {e2e_time_ms:.4f}") + + # SYRK FLOPs: only computing triangular part (n*(n+1)/2 elements) + # Each element requires k multiply-adds, and each multiply-add is 2 FLOPs + syrk_flops = n * (n + 1) * k # (n*(n+1)/2) elements * k * 2 ops + e2e_tflops = (syrk_flops / 1e12) / (e2e_time_ms / 1000.0) + print(f"E2E TFLOP/s = {e2e_tflops:.4f}\n") + + # Time reference (using cuBLAS SYRK) + # Time reference separately using cuBLAS SYRK + d_CRef2_cp = cp.array(h_C) + alpha_host_ref = np.array([alpha], dtype=np.float64) + beta_host_ref = np.array([beta], dtype=np.float64) + alpha_ptr_ref = alpha_host_ref.ctypes.data + beta_ptr_ref = beta_host_ref.ctypes.data + + # Warmup + cublas.dsyrk(handle, cublas_uplo, cublas_trans, n, k, + alpha_ptr_ref, d_A_cp.data.ptr, lda, + beta_ptr_ref, d_CRef2_cp.data.ptr, ldc) + cp.cuda.Stream.null.synchronize() + + # Reset for timing + d_CRef2_cp = cp.array(h_C) + + # Time cuBLAS SYRK + start.record() + for _ in 
range(repeats): + cublas.dsyrk(handle, cublas_uplo, cublas_trans, n, k, + alpha_ptr_ref, d_A_cp.data.ptr, lda, + beta_ptr_ref, d_CRef2_cp.data.ptr, ldc) + end.record() + end.synchronize() + ref_time_ms = cp.cuda.get_elapsed_time(start, end) / repeats + + ref_tflops = (syrk_flops / 1e12) / (ref_time_ms / 1000.0) + _print_performance(ref_time_ms, ref_tflops, "cuBLAS SYRK reference") + + # Print accuracy information + _print_accuracy_metrics(metrics) + + # Calculate performance percentage + performance_pct = (ref_time_ms / e2e_time_ms) * 100.0 + print(f"{performance_pct:.2f}% reference performance ") + finally: + # Destroy handle after all problem sizes are done + cublas.destroy(handle) diff --git a/tutorials/floating-point-emulation/notebooks/utilities/python/common.py b/tutorials/floating-point-emulation/notebooks/utilities/python/common.py new file mode 100644 index 00000000..72f4e0d0 --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/utilities/python/common.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings import runtime as cudart +import numpy as np +import math + +from nvmath.device.common_cuda import ComputeCapability + + +def random_complex(shape, real_dtype, order="C") -> np.ndarray: + return random_real(shape, real_dtype, order) + 1.0j * random_real(shape, real_dtype, order) + + +def random_real(shape, real_dtype, order="C") -> np.ndarray: + # NOTE: reshape does not guarantee layout if order is provided. So we have + # to use copy. + return np.random.randn(np.prod(shape)).astype(real_dtype).reshape(shape).copy(order=order) + + +def random_int(shape, int_dtype, order="C"): + """ + Generate random integers in the range [-2, 2) for signed integers and [0, 4) + for unsigned integers. 
+ """ + min_val, max_val = 0, 4 + if issubclass(int_dtype, np.signedinteger): + min_val, max_val = -2, 2 + # NOTE: reshape does not guarantee layout if order is provided. So we have + # to use copy. + return np.random.randint(min_val, max_val, size=shape, dtype=int_dtype).copy(order=order) + + +def random(shape, dtype, order=None, arrangement=None): + assert order is None or arrangement is None, "Specify only one of order or arrangement" + if arrangement is not None: + order = "C" if arrangement == "row_major" else "F" + if order is None: + order = "C" + if np.issubdtype(dtype, np.floating): + return random_real(shape, dtype, order) + elif np.issubdtype(dtype, np.complexfloating): + return random_complex(shape, dtype, order) + elif np.issubdtype(dtype, np.integer): + return random_int(shape, dtype, order) + + +def CHECK_CUDART(err): + if err != cudart.cudaError_t.cudaSuccess: + err2, str = cudart.cudaGetErrorString(cudart.cudaError_t.cudaSuccess) + raise RuntimeError(f"CUDArt Error: {str}") + + +def fft_perf_GFlops(fft_size, batch, time_ms, coef=1.0): + fft_flops_per_batch = coef * 5.0 * fft_size * math.log2(fft_size) + return batch * fft_flops_per_batch / (1e-3 * time_ms) / 1e9 + + +def mm_perf_GFlops(size, batch, time_ms, coef=1.0): + return coef * 2.0 * batch * size[0] * size[1] * size[2] / (1e-3 * time_ms) / 1e9 + + +def fp16x2_to_complex64(data): + return data[..., ::2] + 1.0j * data[..., 1::2] + + +def complex64_to_fp16x2(data): + shape = (*data.shape[:-1], data.shape[-1] * 2) + output = np.zeros(shape=shape, dtype=np.float16) + output[..., 0::2] = data.real + output[..., 1::2] = data.imag + return output + + +def device_shared_memory(cc: ComputeCapability) -> int: + # Source for these chip memory numbers: + # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications__technical-specifications-per-compute-capability + match cc.integer: + case 1200 | 1210: + return 99 * 1024 + case 900 | 1000 | 1010 | 1030 | 1100: + 
return 227 * 1024 + case 890 | 860: + return 99 * 1024 + case 800 | 870: + return 163 * 1024 + case 750: + return 64 * 1024 + case 700 | 720: + return 96 * 1024 + case _: + return 48 * 1024 diff --git a/tutorials/floating-point-emulation/notebooks/utilities/python/common_cuda.py b/tutorials/floating-point-emulation/notebooks/utilities/python/common_cuda.py new file mode 100644 index 00000000..f9233f4f --- /dev/null +++ b/tutorials/floating-point-emulation/notebooks/utilities/python/common_cuda.py @@ -0,0 +1,41 @@ +from numba import cuda # pip install numba + +import sys +import os + +def get_compute_capability(device_ordinal: int = 0) -> str: + # Check if CUDA is available + if not cuda.is_available(): + raise RuntimeError("CUDA is not available") + + # Get list of devices + devices = cuda.gpus.lst + + if device_ordinal < 0 or device_ordinal >= len(devices): + raise ValueError( + f"Invalid device ordinal {device_ordinal}; " + f"{len(devices)} device(s) available" + ) + + # Get device and compute capability + device = cuda.gpus[device_ordinal] + major, minor = device.compute_capability + + # Combine into a single integer, e.g. 
def setup_local_arch(device_ordinal: int = 0):
    """Prepend the CUDA 13.1 toolchain to PATH and export LOCAL_ARCH.

    LOCAL_ARCH is the compute-capability string of the selected device
    (e.g. "86", "90a"). The value is also returned for convenience.
    """
    os.environ['PATH'] = '/usr/local/cuda-13.1/bin:' + os.environ['PATH']
    local_arch = get_compute_capability(device_ordinal)
    os.environ['LOCAL_ARCH'] = local_arch
    return local_arch


def setup_cmake_project(device_ordinal: int = 0):
    """Configure the tutorial CMake project for the local GPU architecture.

    Improvements over the original:
    - `os.system("rm -rf ./build/*")` missed dotfiles and its exit status was
      ignored; the build directory is now cleaned with shutil.rmtree.
    - cmake is invoked without a shell (list argv, no string interpolation
      through the shell) and a non-zero exit status raises RuntimeError
      instead of being silently dropped.

    Raises:
        RuntimeError: if the cmake configure step fails.
    """
    import shutil
    import subprocess

    local_arch = setup_local_arch(device_ordinal)

    # Clean build directory first (temporary files from previous configures)
    shutil.rmtree("./build", ignore_errors=True)
    os.makedirs("./build", exist_ok=True)

    print(f"Building for {local_arch}")
    # Configure CMake to use build/ for temporary files; pass the architecture
    # value directly instead of relying on shell expansion of $LOCAL_ARCH.
    result = subprocess.run(
        ["cmake", "-B", "build", f"-DTUTORIAL_CUDA_ARCHITECTURE={local_arch}"]
    )
    if result.returncode != 0:
        raise RuntimeError(f"cmake configuration failed (exit code {result.returncode})")
def time_numba(kernel, grid_dim, block_dim, shared_memory_size, ncycles, *args, get_results=None):
    """Time a numba-cuda kernel with CUDA events; return mean ms per launch.

    The kernel is compiled (with its max dynamic shared memory attribute
    raised to shared_memory_size) and launched once for warmup before
    ncycles timed launches on a dedicated stream.

    Args:
        kernel: a numba @cuda.jit kernel object.
        grid_dim, block_dim: launch configuration.
        shared_memory_size: dynamic shared memory bytes per launch.
        ncycles: number of timed launches to average over.
        *args: arguments forwarded to the kernel.
        get_results: optional callable invoked once after the warmup launch
            (e.g. to copy back and validate results before timing starts).

    Returns:
        Average elapsed time per launch in milliseconds.
    """
    ## Numba
    stream = cuda.stream()
    start, stop = cuda.event(), cuda.event()
    cuda.synchronize()

    # jit + set max dynamic smem size
    set_max_dynamic_shared_size_bytes(kernel, shared_memory_size, *args)

    # warmup
    kernel[grid_dim, block_dim, stream, shared_memory_size](*args)
    stream.synchronize()

    if get_results is not None:
        get_results()

    # time: events are recorded on the same stream as the launches, so the
    # measured span covers exactly the ncycles kernel executions
    start.record(stream)
    for _ in range(ncycles):
        kernel[grid_dim, block_dim, stream, shared_memory_size](*args)
    stop.record(stream)
    stream.synchronize()

    time_ms = cuda.event_elapsed_time(start, stop) / ncycles
    return time_ms


def get_active_blocks_per_multiprocessor(kernel, block_dim, dynamic_smem_size, *args):
    """Return achievable occupancy (active blocks per SM) for this launch config.

    Compiles the kernel for the concrete argument types of *args and queries
    the driver's occupancy calculator with the flattened block size.
    """
    argsty = tuple([cuda_typeof.typeof(a) for a in args])
    compiled = kernel.compile(argsty)
    ctx = cuda.current_context()
    cufunc = compiled.library.get_cufunc()
    active_per_sm = ctx.get_active_blocks_per_multiprocessor(cufunc, math.prod(block_dim), dynamic_smem_size)

    return active_per_sm
# matrix is always in C-order (cupy/numpy) but smem should always be in F-order
# (expected by cuBLASDx)
@cuda.jit(device=True, forceinline=True)
def load_to_shared_batched(matrix, smem, batch, dim, ld, row_major=False):
    """Stage one batch of a (batch, rows, cols) global array into flat shared memory."""
    count = dim[0] * dim[1]
    base = batch * dim[1] * ld
    for idx in range(cuda.threadIdx.x, count, cuda.blockDim.x):
        r = idx // dim[1]
        c = idx % dim[1]
        if row_major:
            smem[base + r * ld + c] = matrix[batch, r, c]
        else:
            smem[base + c * ld + r] = matrix[batch, r, c]


@cuda.jit(device=True, forceinline=True)
def load_to_shared(matrix, smem, dim, ld, row_major=False):
    """Stage a 2-D global matrix into flat shared memory with leading dimension ld."""
    bx = cuda.blockDim.x
    by = cuda.blockDim.y
    first = cuda.threadIdx.x + bx * cuda.threadIdx.y + bx * by * cuda.threadIdx.z
    stride = bx * by * cuda.blockDim.z
    for idx in range(first, dim[0] * dim[1], stride):
        r = idx // dim[1]
        c = idx % dim[1]
        if row_major:
            smem[r * ld + c] = matrix[r, c]
        else:
            smem[c * ld + r] = matrix[r, c]


@cuda.jit(device=True, forceinline=True)
def load_to_shared_2d(matrix, smem, dim, row_major=False):
    """Stage a 2-D global matrix into a 2-D shared array (transposed unless row_major)."""
    bx = cuda.blockDim.x
    by = cuda.blockDim.y
    first = cuda.threadIdx.x + bx * cuda.threadIdx.y + bx * by * cuda.threadIdx.z
    stride = bx * by * cuda.blockDim.z
    for idx in range(first, dim[0] * dim[1], stride):
        r = idx // dim[1]
        c = idx % dim[1]
        if row_major:
            smem[r, c] = matrix[r, c]
        else:
            smem[c, r] = matrix[r, c]


@cuda.jit(device=True, forceinline=True)
def load_to_shared_1d_float16x2(matrix, smem, dim, ld, row_major=False):
    """Pack adjacent (re, im) float16 pairs from global memory into shared float16x2 values."""
    bx = cuda.blockDim.x
    by = cuda.blockDim.y
    first = cuda.threadIdx.x + bx * cuda.threadIdx.y + bx * by * cuda.threadIdx.z
    stride = bx * by * cuda.blockDim.z
    for idx in range(first, dim[0] * dim[1], stride):
        r = idx // dim[1]
        c = idx % dim[1]
        re = matrix[r, 2 * c + 0]
        im = matrix[r, 2 * c + 1]
        if row_major:
            smem[r * ld + c] = float16x2(re, im)
        else:
            smem[c * ld + r] = float16x2(re, im)


@cuda.jit(device=True, forceinline=True)
def store_from_shared_batched(smem, matrix, batch, dim, ld):
    """Write one batch back from flat column-major shared memory to global memory."""
    count = dim[0] * dim[1]
    base = batch * dim[1] * ld
    for idx in range(cuda.threadIdx.x, count, cuda.blockDim.x):
        r = idx // dim[1]
        c = idx % dim[1]
        matrix[batch, r, c] = smem[base + c * ld + r]


@cuda.jit(device=True, forceinline=True)
def store_from_shared(smem, matrix, dim, ld, row_major=False):
    """Write a 2-D matrix back from flat shared memory to global memory."""
    bx = cuda.blockDim.x
    by = cuda.blockDim.y
    first = cuda.threadIdx.x + bx * cuda.threadIdx.y + bx * by * cuda.threadIdx.z
    stride = bx * by * cuda.blockDim.z
    for idx in range(first, dim[0] * dim[1], stride):
        r = idx // dim[1]
        c = idx % dim[1]
        if row_major:
            matrix[r, c] = smem[r * ld + c]
        else:
            matrix[r, c] = smem[c * ld + r]


@cuda.jit(device=True, forceinline=True)
def store_from_shared_2d(smem, matrix, dim):
    """Write a 2-D matrix back from a transposed 2-D shared array."""
    bx = cuda.blockDim.x
    by = cuda.blockDim.y
    first = cuda.threadIdx.x + bx * cuda.threadIdx.y + bx * by * cuda.threadIdx.z
    stride = bx * by * cuda.blockDim.z
    for idx in range(first, dim[0] * dim[1], stride):
        r = idx // dim[1]
        c = idx % dim[1]
        matrix[r, c] = smem[c, r]


@cuda.jit(device=True, forceinline=True)
def store_from_shared_1d_float16x2(smem, matrix, dim, ld):
    """Unpack shared float16x2 values into adjacent (re, im) float16 pairs in global memory."""
    bx = cuda.blockDim.x
    by = cuda.blockDim.y
    first = cuda.threadIdx.x + bx * cuda.threadIdx.y + bx * by * cuda.threadIdx.z
    stride = bx * by * cuda.blockDim.z
    for idx in range(first, dim[0] * dim[1], stride):
        r = idx // dim[1]
        c = idx % dim[1]
        pair = smem[c * ld + r]
        matrix[r, 2 * c + 0] = pair.x
        matrix[r, 2 * c + 1] = pair.y
# Adjust casting range + extra_width = (exp + 1) - 63 + extra_width = extra_width if extra_width > 0 else 0 + skip_slices = div_up(extra_width, uint8_width) + exp -= skip_slices * uint8_width + + if exp < 0: + r = 0 + else: + val = copy_with_exponent(val, exp + BIAS) + r = np.int64(val) + + for _i in range(0, Slices): + i = Slices - 1 - _i + + if _i < skip_slices: + reg_pack = np.uint8(0) + else: + reg_pack = np.uint8(r) + slices[i] = np.int8(reg_pack) + r = np.int64((r >> uint8_width) + (reg_pack >> int8_width)) + + @cuda.jit(launch_bounds=(BlockSize, 2)) + def slice_kernel(in_tensor, shift_tensor, out_tensor, reduction_dim_size): + tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x + + row_idx = tid // reduction_dim_size + col_idx = tid % reduction_dim_size + + if SliceMatrix == slice_matrix.a: + shift_idx = row_idx + else: + shift_idx = col_idx + + slices = cuda.local.array(shape=(Slices,), dtype=np.int8) + slices_from_fp64(slices, in_tensor[row_idx, col_idx], shift_tensor[shift_idx]) + + for elem in range(0, Slices): + out_tensor[row_idx, col_idx, elem] = slices[elem] + + return slice_kernel + +def get_max_reduce_kernel(SliceMatrix, BlockSize=64): + + @cuda.jit(device=True) + def max_op(a, b): + return a if a > b else b + + block_reduce = coop.block.reduce(np.float64, BlockSize, max_op) + + @cuda.jit(launch_bounds=(BlockSize, 2), link=block_reduce.files) + def max_reduce_kernel(in_tensor, out_tensor): + tid = cuda.threadIdx.x + bid = cuda.blockIdx.x + + global_tile = in_tensor[bid, :] if SliceMatrix == slice_matrix.a else in_tensor[:, bid] + + local_max = 0 + for i in range(tid, len(global_tile), BlockSize): + local_max = max(local_max, abs(global_tile[i])) + + # TODO(cbrower) check if we can reduce with smem and how this reduction happens + local_max = block_reduce(local_max) + + if tid == 0: + out_tensor[bid] = max_to_exponent_shift(local_max) + + return max_reduce_kernel diff --git 
# IEEE-754 binary64 exponent bias
BIAS = 1023

class MatrixHalf(IntEnum):
    """Which triangular half of a symmetric matrix is referenced (SYRK uplo)."""
    lower = 0
    upper = 1

@cuda.jit(device=True, forceinline=True)
def extract_sign(x:np.float64) -> np.int32:
    # Isolate the sign bit (bit 63) of the binary64 bit pattern.
    # NOTE(review): the masked int64 is narrowed to int32, so the set bit does
    # not survive at position 63 — presumably callers only test zero/non-zero;
    # confirm against usage.
    SIGN_MASK = 0x8000000000000000
    return np.int32(x.view(np.int64) & SIGN_MASK)

@cuda.jit(device=True, forceinline=True)
def copy_with_sign(x:np.float64, sign:np.int32) -> np.float64:
    # Return x with its sign bit replaced by `sign` shifted into bit 63.
    SIGN_MASK = 0x8000000000000000
    bits = (x.view(np.int64) & ~SIGN_MASK) | (np.int64(sign) << 63)
    return bits.view(np.float64)

@cuda.jit(device=True, forceinline=True)
def extract_exponent(x:np.float64) -> np.int32:
    # Biased 11-bit exponent field (bits 52..62); 0 indicates zero/subnormal.
    EXPONENT_MASK = 0x7FF0000000000000
    EXPONENT_SHIFT = 52
    return np.int32((x.view(np.int64) & EXPONENT_MASK) >> EXPONENT_SHIFT)

@cuda.jit(device=True, forceinline=True)
def copy_with_exponent(x:np.float64, exponent:np.int32) -> np.float64:
    # Return x with its biased exponent field replaced; sign and mantissa untouched.
    EXPONENT_MASK = 0x7FF0000000000000
    EXPONENT_SHIFT = 52
    bits = (x.view(np.int64) & ~EXPONENT_MASK) | (np.int64(exponent) << EXPONENT_SHIFT)
    return bits.view(np.float64)

@cuda.jit(device=True, forceinline=True)
def extract_mantissa_hi(x:np.float64) -> np.int32:
    # Upper 20 mantissa bits (bits 32..51) of the binary64 pattern.
    MANTISSA_HI_MASK = 0x000FFFFF00000000
    return np.int32((x.view(np.int64) & MANTISSA_HI_MASK) >> 32)

@cuda.jit(device=True, forceinline=True)
def copy_with_mantissa_hi(x:np.float64, mantissa_hi:np.int32) -> np.float64:
    # Return x with its upper 20 mantissa bits replaced.
    MANTISSA_HI_MASK = 0x000FFFFF00000000
    bits = (x.view(np.int64) & ~MANTISSA_HI_MASK) | (np.int64(mantissa_hi) << 32)
    return bits.view(np.float64)
def get_width(dtype: np.dtype):
    """Return the number of usable magnitude bits per slice element.

    int8 keeps 7 value bits (one bit is the sign); uint8 uses all 8.

    Raises:
        ValueError: for any dtype other than int8/uint8. The original
            silently returned None here, which surfaced later as a
            confusing TypeError at the call site.
    """
    if dtype == np.int8:
        return 7
    if dtype == np.uint8:
        return 8
    raise ValueError(f"get_width() supports only int8/uint8, got {dtype}")


# Host version - works with dtype objects.
# NOTE: intentionally returns the same values as get_width(); kept as a
# separate name for symmetry with the device-side max_exponent_int8/uint8
# helpers.
def max_exponent(dtype: np.dtype):
    """Return the maximum exponent representable by one slice of *dtype*.

    Raises:
        ValueError: for any dtype other than int8/uint8 (previously
            returned None silently).
    """
    if dtype == np.int8:
        return 7
    if dtype == np.uint8:
        return 8
    raise ValueError(f"max_exponent() supports only int8/uint8, got {dtype}")