diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ce0280c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,32 @@ +minimum_pre_commit_version: 3.0.0 + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-added-large-files + args: ['--maxkb=1024'] + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: mixed-line-ending + args: ['--fix=lf'] # fix line endings to unix style + - id: check-case-conflict + - id: check-json + - id: trailing-whitespace + +- repo: https://github.com/executablebooks/mdformat + rev: 0.7.22 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-gfm + - mdformat-frontmatter + - mdformat-myst + - mdformat-tables + - mdformat-toc + - mdformat-black diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..c8be57a --- /dev/null +++ b/_config.yml @@ -0,0 +1,56 @@ +# Site settings +title: TileFusion +description: TileFusion is a highly efficient C++ macro kernel template library designed to elevate the level of abstraction in CUDA C for processing tiles. +baseurl: "/TileFusion" +url: "https://microsoft.github.io" +authors: + - name: "Ying Cao" + email: "lcy.seso@gmail.com" + github: "lcy-seso" + - name: "Chengxiang Qi" + email: "KuangjuX@outlook.com" + github: "KuangjuX" + +mathjax: + enable: true # MathJax equations, e.g. 
true, false (default) + combo: "tex-mml-chtml" + tags: "none" # "none", "ams" (default), "all" +google_fonts: + - name: "Source Sans Pro" + weights: "400,400i,700,700i" + - name: "Lora" + weights: "400,400i,700,700i" + +# Build settings +markdown: kramdown +kramdown: + math_engine: mathjax +theme: minima + +minima: + date_format: "%b %-d, %Y" + social_links: + - { platform: github, user_url: "https://github.com/microsoft/TileFusion" } + +plugins: + - jekyll-feed + - jekyll-seo-tag + +# Exclude from processing +exclude: + - 3rd-party + - .gitignore + +# Navigation +header_pages: + - index.md + - docs/installation.md + - docs/index.md + - examples/index.md + - benchmarks/index.md + - docs/about.md + +# Just the Docs configuration +aux_links: + "TileFusion on GitHub": + - "//github.com/microsoft/TileFusion" diff --git a/_includes/custom-head.html b/_includes/custom-head.html new file mode 100644 index 0000000..e7fdcd5 --- /dev/null +++ b/_includes/custom-head.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/_includes/footer.html b/_includes/footer.html new file mode 100644 index 0000000..f23f3e6 --- /dev/null +++ b/_includes/footer.html @@ -0,0 +1,37 @@ + diff --git a/_layouts/mathjax.html b/_layouts/mathjax.html new file mode 100644 index 0000000..ab89f74 --- /dev/null +++ b/_layouts/mathjax.html @@ -0,0 +1,19 @@ +--- +layout: default +--- + +{{ content }} + + + + diff --git a/assets/TileFusion-logo.png b/assets/TileFusion-logo.png new file mode 100644 index 0000000..bf868cb Binary files /dev/null and b/assets/TileFusion-logo.png differ diff --git a/assets/css/custom.css b/assets/css/custom.css new file mode 100644 index 0000000..26ad283 --- /dev/null +++ b/assets/css/custom.css @@ -0,0 +1,72 @@ +.site-footer { + background-color: #f8f9fa; + padding: 40px 0; + color: #333; + border-top: 1px solid #e9ecef; +} + +.footer-header { + margin-bottom: 30px; +} + +.footer-heading { + font-size: 24px; + margin-bottom: 15px; + color: #0078d4; /* Microsoft blue */ +} + 
+.footer-content { + display: flex; + flex-wrap: wrap; + justify-content: space-between; + margin-bottom: 30px; + width: 100%; /* Ensure full width */ +} + +.footer-col { + flex: 1; + min-width: 200px; + padding: 0 15px; + margin-bottom: 20px; +} + +.footer-col h3 { + font-size: 18px; + margin-bottom: 15px; + color: #0078d4; +} + +.footer-col ul { + list-style: none; + margin-left: 0; + padding-left: 0; +} + +.footer-col ul li { + margin-bottom: 8px; +} + +.footer-bottom { + clear: both; /* Ensure footer-bottom appears after all floating elements */ + width: 100%; /* Ensure full width */ + text-align: center; + border-top: 1px solid #e9ecef; + padding-top: 20px; + font-size: 14px; + color: #666; +} + +body { + font-family: 'Roboto', sans-serif; +} + +@media screen and (max-width: 600px) { + .footer-content { + flex-direction: column; + } + + .footer-col { + width: 100%; + padding: 0; + } +} diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 0000000..9ae3fff --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,5 @@ +.vscode/* +build/* +__pycache__/ +.DS_Store +**/.DS_Store diff --git a/benchmarks/gemm.md b/benchmarks/gemm.md new file mode 100644 index 0000000..b1fa16b --- /dev/null +++ b/benchmarks/gemm.md @@ -0,0 +1,18 @@ +## Test Environment + +- **GPU**: NVIDIA Tesla A100 +- **CUDA Version**: 12.6 + +### Results + +| [M, N, K] | [kTM, kTN, kTK] | WarpLayout | kRK | CUTLASS(ms) | TileFusion(ms) | +| :----------------- | :-------------: | :--------: | :-: | :---------: | :------------: | +| [1024, 1024, 512] | [64, 128, 128] | [2, 2] | 16 | 0.017591 | 0.016548 | +| [1024, 1024, 1024] | [64, 128, 128] | [2, 2] | 16 | 0.029245 | 0.027156 | +| [2048, 2048, 1024] | [64, 128, 128] | [2, 2] | 16 | 0.065372 | 0.070431 | +| [2048, 2048, 2048] | [64, 128, 128] | [2, 2] | 16 | 0.101253 | 0.128143 | +| [4096, 4096, 4096] | [64, 128, 128] | [2, 2] | 16 | 0.818606 | 0.969605 | +| [8192, 8192, 1024] | [64, 128, 128] | [2, 2] | 16 | 0.871526 | 
0.971059 | +| [8192, 8192, 2048] | [64, 128, 128] | [2, 2] | 16 | 1.937879 | 1.931223 | +| [8192, 8192, 4096] | [64, 128, 128] | [2, 2] | 16 | 3.924275 | 3.956757 | +| [8192, 8192, 8192] | [64, 128, 128] | [2, 2] | 16 | 7.740396 | 8.080589 | diff --git a/benchmarks/global_to_shared_copy.md b/benchmarks/global_to_shared_copy.md new file mode 100644 index 0000000..841f1e7 --- /dev/null +++ b/benchmarks/global_to_shared_copy.md @@ -0,0 +1,28 @@ +This preliminary test evaluates the performance of transferring a row-major data tile containing half-precision floating-point values between global memory and shared memory. The transfer process involves loading the data tile into shared memory and subsequently storing it back to global memory. This cycle is repeated 100 times to measure performance. + +Performance is assessed based on the total time required to complete the 100 data tile transfers. + +## Implementations + +The test includes implementations using TileFusion and cutlass, with no bank conflicts observed in the NVIDIA Compute Utility. The cutlass implementation utilizes a copy plan that allows for maximal global memory coalescing to optimally utilize the global memory. 
+ +## Test Environment + +- **GPU**: NVIDIA Tesla A100 +- **CUDA Version**: 12.6 + +## Results + +| Shape | Warp Layout | tilefusion(ms) | cutlass(ms) | Ratio | +| :----------------- | :---------: | :------------: | :---------: | :----: | +| RowMajor(16, 64) | (1, 1) | 0.02996 | 0.02957 | 1.013 | +| RowMajor(64, 64) | (1, 1) | 0.05073 | 0.05071 | 1 | +| RowMajor(64, 64) | (2, 1) | 0.05045 | 0.05068 | 0.9956 | +| RowMajor(64, 64) | (4, 1) | 0.05119 | 0.05145 | 0.995 | +| RowMajor(128, 128) | (1, 1) | 0.1369 | 0.154 | 0.8888 | +| RowMajor(128, 128) | (2, 2) | 0.1374 | 0.134 | 1.025 | +| RowMajor(128, 128) | (4, 2) | 0.138 | 0.1382 | 0.9984 | +| RowMajor(128, 256) | (1, 1) | 0.2464 | 0.3694 | 0.6671 | +| RowMajor(128, 256) | (2, 2) | 0.2471 | 0.2458 | 1.005 | +| RowMajor(128, 256) | (2, 4) | 0.2592 | 0.2511 | 1.032 | +| RowMajor(128, 256) | (4, 4) | 0.2543 | 0.2572 | 0.9889 | diff --git a/benchmarks/index.md b/benchmarks/index.md new file mode 100644 index 0000000..27b965c --- /dev/null +++ b/benchmarks/index.md @@ -0,0 +1,13 @@ +--- +layout: page +title: Benchmarks +nav_order: 5 +has_children: true +--- + +This section contains performance benchmarks for TileFusion across various workloads. + +## Contents + +- [Data Transfer Between Global and Shared Memory](global_to_shared_copy.md) +- [GEMM Performance](gemm.md) diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 0000000..dfbe618 --- /dev/null +++ b/docs/about.md @@ -0,0 +1,33 @@ +--- +layout: page +title: About +nav_order: 6 +has_children: false +--- + +This project is developed and maintained by the following authors: + +- [Ying Cao](https://github.com/lcy-seso) +- [Chengxiang Qi](https://github.com/KuangjuX) + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. 
For details, visit [https://cla.opensource.microsoft.com](https://cla.opensource.microsoft.com). + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). +Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..1fc9e67 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,12 @@ +--- +layout: page +title: Documentation +nav_order: 3 +has_children: true +--- + +Welcome to the TileFusion documentation. Here you'll find detailed information about the library's design documents, APIs, and usage patterns. 
+ +## Contents + +- [Data Layout for Efficient Shared Memory Access](tiles_in_shared_memory.md) diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..7dd0387 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,58 @@ +--- +layout: page +title: Installation +nav_order: 2 +has_children: false +--- + +TileFusion requires a C++20 host compiler, CUDA 12.0 or later, and GCC version 10.0 or higher to support C++20 features. + +## Download + +```bash +git clone git@github.com:microsoft/TileFusion.git +cd TileFusion && git submodule update --init --recursive +``` + +## Build from Source + +### Building the C++ Library Using Makefile + +1. To build the project using the provided `Makefile`, simply run: + + ```bash + make + ``` + +2. Run the C++ unit tests: + + - **Run a single C++ unit test**: + ```bash + make unit_test_cpp CPP_UT=test_gemm + ``` + - **Run all C++ unit tests**: + ```bash + make unit_test_cpps + ``` + +### Building the Python Wrapper + +1. Build the wheel: + + ```bash + python3 setup.py build bdist_wheel + ``` + +2. Clean the build: + + ```bash + python3 setup.py clean + ``` + +3. Install the Python wrapper in editable mode (recommended for development): + + ```bash + python3 setup.py develop + ``` + + This allows you to edit the source code directly without needing to reinstall it repeatedly. diff --git a/docs/tiles_in_shared_memory.md b/docs/tiles_in_shared_memory.md new file mode 100644 index 0000000..bde997e --- /dev/null +++ b/docs/tiles_in_shared_memory.md @@ -0,0 +1,30 @@ +--- +layout: mathjax +title: Tiles in Shared Memory +--- + +## A Base Tile + +A `BaseTile` is a two-dimensional collection of data accessed cooperatively by threads within a single warp, with each thread issuing a single data access instruction. + +Let’s consider some specific examples. 
Suppose each thread accesses 128-bit data in a single access, and the threads are arranged within the warp in a row-major fashion, where threads along the rows have consecutive thread indices. + +If the data is in ***half-precision*** floating-point format: + +- When the threads in a warp are arranged in a $4 \times 8$ configuration, the `BaseTile` has dimensions of $4 \times 64$. +- When the threads in a warp are arranged in an $8 \times 4$ configuration, the `BaseTile` has dimensions of $8 \times 32$. +- When the threads in a warp are arranged in a $16 \times 2$ configuration, the `BaseTile` has dimensions of $16 \times 16$. + +Now, suppose the data is in ***single-precision*** floating-point format: + +- When the threads in a warp are arranged in a $4 \times 8$ configuration, the `BaseTile` has dimensions of $4 \times 32$. +- When the threads in a warp are arranged in an $8 \times 4$ configuration, the `BaseTile` has dimensions of $8 \times 16$. +- When the threads in a warp are arranged in a $16 \times 2$ configuration, the `BaseTile` has dimensions of $16 \times 8$. + +A keen observer may notice that the largest dimension of a `BaseTile` never exceeds 1024 bits. This is not coincidental; it is a result of several hardware parameters related to global and shared memory access. Global memory traffic is routed through the data caches (the L1 and/or L2 caches). An L1 cache line is 1024 bits, which also corresponds to the maximum memory transaction size. Additionally, shared memory consists of 32 banks, each with a width of 4 bytes, collectively amounting to 1024 bits. This alignment enhances the efficiency of data transfer between global and shared memory. + +## Storing Tiles in Shared Memory + +To ensure an efficient access pattern, we need to impose a constraint by assuming that each thread accesses 128-bit data, which is the maximum width of a vectorized access instruction. Consequently, the entire warp accesses $4 \times 128$ bytes of data. 
The largest transaction size is 128 bytes. When a warp loads or stores more than 128 bytes of data, the GPU does not issue a single transaction but splits the access into several; for the $4 \times 128$ bytes considered here, that means four transactions. Furthermore, bank conflicts are evaluated per transaction, i.e., only accesses served by the same transaction can conflict with one another. + +Our objective is to avoid bank conflicts when loading data tiles from or storing data tiles to shared memory. diff --git a/index.md b/index.md new file mode 100644 index 0000000..3ffbb9b --- /dev/null +++ b/index.md @@ -0,0 +1,18 @@ +--- +layout: home +nav_order: 1 +--- + +
+ +
+ +# Simplifying Kernel Fusion with Tile Processing + +**TileFusion** is a highly efficient C++ macro kernel template library designed to elevate the level of abstraction in CUDA C for processing tiles. It is designed around three goals: + +- **Higher-Level Programming**: TileFusion offers a set of device kernels for transferring tiles between the three levels of the CUDA device's memory hierarchy and for computing on tiles. +- **Modularity**: TileFusion enables users to construct their applications by processing larger tiles in time and space using the provided BaseTiles. +- **Efficiency**: TileFusion offers highly efficient implementations of these device kernels. + +TileFusion adopts a hardware bottom-up approach by building kernels around the core concept of the **BaseTile**. The shapes of these BaseTiles align with TensorCore's instruction shape and encapsulate hardware-dependent performance parameters to optimally utilize TensorCore's capabilities. Serving as building blocks, these BaseTiles are then combined to construct larger tiles in both temporal and spatial dimensions, enabling users to process larger tiles composed of BaseTiles for their applications.