From 2c2125f73e0c18b26340000cedcc78d75f9e707d Mon Sep 17 00:00:00 2001 From: Aleksander Dudek Date: Tue, 10 Feb 2026 12:50:42 +0000 Subject: [PATCH 1/2] ckTileEngine pooling --- test/ck_tile/CMakeLists.txt | 1 + .../pooling_tile_engine/CMakeLists.txt | 298 +++++++++ test/ck_tile/pooling_tile_engine/README.md | 88 +++ .../configs/simple_test_config.json | 60 ++ .../extract_test_params.py | 139 +++++ .../test_pooling_simple.cpp | 240 ++++++++ tile_engine/CMakeLists.txt | 1 + tile_engine/ops/pooling/CMakeLists.txt | 205 +++++++ .../ops/pooling/configs/default_config.json | 21 + tile_engine/ops/pooling/pooling_benchmark.hpp | 133 ++++ .../ops/pooling/pooling_benchmark_single.cpp | 204 ++++++ tile_engine/ops/pooling/pooling_common.hpp | 48 ++ .../ops/pooling/pooling_instance_builder.py | 579 ++++++++++++++++++ tile_engine/ops/pooling/pooling_profiler.hpp | 149 +++++ .../ops/pooling/pooling_validation_utils.py | 134 ++++ 15 files changed, 2300 insertions(+) create mode 100644 test/ck_tile/pooling_tile_engine/CMakeLists.txt create mode 100644 test/ck_tile/pooling_tile_engine/README.md create mode 100644 test/ck_tile/pooling_tile_engine/configs/simple_test_config.json create mode 100644 test/ck_tile/pooling_tile_engine/extract_test_params.py create mode 100644 test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp create mode 100644 tile_engine/ops/pooling/CMakeLists.txt create mode 100644 tile_engine/ops/pooling/configs/default_config.json create mode 100644 tile_engine/ops/pooling/pooling_benchmark.hpp create mode 100644 tile_engine/ops/pooling/pooling_benchmark_single.cpp create mode 100644 tile_engine/ops/pooling/pooling_common.hpp create mode 100644 tile_engine/ops/pooling/pooling_instance_builder.py create mode 100644 tile_engine/ops/pooling/pooling_profiler.hpp create mode 100644 tile_engine/ops/pooling/pooling_validation_utils.py diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index d9324119919..af03d2adfd0 100644 --- 
a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -42,3 +42,4 @@ add_subdirectory(gemm_tile_engine) add_subdirectory(pooling) add_subdirectory(grouped_conv) add_subdirectory(gemm_streamk_tile_engine) +add_subdirectory(pooling_tile_engine) diff --git a/test/ck_tile/pooling_tile_engine/CMakeLists.txt b/test/ck_tile/pooling_tile_engine/CMakeLists.txt new file mode 100644 index 00000000000..de238d144ee --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/CMakeLists.txt @@ -0,0 +1,298 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# ============================================================================ +# Pooling Tile Engine Unit Tests +# +# This CMake file creates unit tests for tile_engine generated pooling kernels. +# Each kernel configuration gets its own test executable. +# ============================================================================ + +# Locate tile_engine pooling scripts directory +set(TILE_ENGINE_POOLING_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/pooling") + +if(NOT EXISTS ${TILE_ENGINE_POOLING_DIR}) + message(WARNING "Tile engine pooling directory not found: ${TILE_ENGINE_POOLING_DIR}") + return() +endif() + +# ============================================================================ +# create_individual_pool_test_target +# +# Creates a single test executable for a specific pooling kernel configuration. 
+# +# Parameters: +# datatype - Data type (fp16, fp32, bf16) +# config_name - Configuration file name without .json extension +# trait - Kernel trait combination string +# tile_config - Tile configuration parameters +# config_json - Full path to JSON configuration file +# ============================================================================ +function(create_individual_pool_test_target datatype config_name trait tile_config config_json) + set(target_name "test_pooling_tile_engine_${datatype}_${config_name}_${trait}_${tile_config}") + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${config_name}") + + # Generated header path (already created during cmake configuration) + set(test_header "${working_path}/pooling_single_pool_${datatype}_${trait}_${tile_config}.hpp") + set(test_params_header "${working_path}/test_params.hpp") + + # Verify header exists + if(NOT EXISTS ${test_header}) + message(WARNING "Generated header not found: ${test_header}") + return() + endif() + + # Verify test parameters header exists + if(NOT EXISTS ${test_params_header}) + message(WARNING "Test parameters header not found: ${test_params_header}") + return() + endif() + + # Create GTest executable for this kernel configuration + add_gtest_executable(${target_name} + ${CMAKE_CURRENT_SOURCE_DIR}/test_pooling_simple.cpp + ) + + # Configure GPU architectures for HIP compilation + set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOLING_TEST_GPU_TARGETS}) + + # Define preprocessor macros for generated header location and test parameters + target_compile_definitions(${target_name} PRIVATE + POOLING_SINGLE_INSTANCE_HPP="${test_header}" + POOLING_TEST_PARAMS_HPP="${test_params_header}" + ) + + # Include directories for headers and dependencies + target_include_directories(${target_name} PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_BINARY_DIR}/include + ${PROJECT_SOURCE_DIR} # Root directory for tile_engine access + ${GTEST_INCLUDE_DIRS} + ) + + # Compiler options 
matching tile_engine requirements + target_compile_options(${target_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + -include ${test_header} + ) + + # Add FP8 format definitions for proper data type interpretation + if(CK_USE_OCP_FP8) + target_compile_options(${target_name} PRIVATE -DCK_TILE_USE_OCP_FP8) + endif() + + message(STATUS " Created test target: ${target_name}") +endfunction() + +# ============================================================================ +# build_pool_test_targets +# +# Builds all test targets for a specific datatype/config combination. +# Uses tile_engine's two-step process: list kernels, then generate tests. +# +# Parameters: +# datatype - Data type (fp16, fp32, bf16) +# config_name - Configuration file name without .json extension +# ============================================================================ +function(build_pool_test_targets datatype config_name) + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${config_name}") + + # Locate and validate configuration file + set(config_filename "${config_name}.json") + set(json_blob "${CMAKE_CURRENT_SOURCE_DIR}/configs/${config_filename}") + + if(NOT EXISTS ${json_blob}) + message(WARNING "Test config file not found: ${json_blob}") + return() + endif() + + # Prepare build directory for this configuration + file(MAKE_DIRECTORY ${working_path}) + + # STEP 1: Discovery phase - list all valid kernel configurations + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_POOLING_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${json_blob} + --list_kernels + WORKING_DIRECTORY ${TILE_ENGINE_POOLING_DIR} + RESULT_VARIABLE ret + OUTPUT_VARIABLE list_output + ERROR_VARIABLE list_error + ) + + if(NOT ret EQUAL 0) + message(WARNING "Failed to list pooling kernels for ${datatype}_${config_name}: ${list_error}") + return() + endif() + + # Verify kernel list file was generated + 
if(NOT EXISTS ${working_path}/pool_kernel_list.txt) + message(STATUS "No pooling kernels found for ${datatype}_${config_name}") + return() + endif() + + message(STATUS "Building pooling tests for ${datatype}_${config_name}") + + # STEP 2a: Determine pooling dimension from config + # Read the trait config to find pooling_dim + file(READ ${json_blob} config_content) + string(FIND "${config_content}" "\"3d\"" found_3d) + if(found_3d GREATER -1) + set(pooling_dim "3d") + else() + set(pooling_dim "2d") + endif() + + # STEP 2b: Extract test parameters from config + set(test_params_file "${working_path}/test_params.hpp") + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_SOURCE_DIR}/extract_test_params.py + --config_file ${json_blob} + --output_file ${test_params_file} + --pooling_dim ${pooling_dim} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE extract_ret + OUTPUT_VARIABLE extract_output + ERROR_VARIABLE extract_error + ) + + if(NOT extract_ret EQUAL 0) + message(WARNING "Failed to extract test parameters for pooling ${datatype}: ${extract_error}") + return() + endif() + + # STEP 2c: Header generation phase - generate headers using --gen_single + message(STATUS " Generating pooling headers using --gen_single...") + + file(STRINGS ${working_path}/pool_kernel_list.txt kernel_lines) + set(gen_count 0) + + foreach(line IN LISTS kernel_lines) + # Parse kernel specification format: kernel_name|tile_config|trait_combo + string(REPLACE "|" ";" parts "${line}") + list(LENGTH parts parts_len) + if(parts_len EQUAL 3) + list(GET parts 0 kernel_name) + list(GET parts 1 tile_config) + list(GET parts 2 trait_combo) + + # Generate header using --gen_single + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_POOLING_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${json_blob} + --gen_single + --kernel_name "${kernel_name}" + --tile_config "${tile_config}" + --trait_combo 
"${trait_combo}" + WORKING_DIRECTORY ${TILE_ENGINE_POOLING_DIR} + RESULT_VARIABLE gen_ret + OUTPUT_VARIABLE gen_output + ERROR_VARIABLE gen_error + ) + + if(NOT gen_ret EQUAL 0) + message(WARNING "Failed to generate pooling header for ${kernel_name}: ${gen_error}") + else() + math(EXPR gen_count "${gen_count} + 1") + endif() + endif() + endforeach() + + message(STATUS " Generated ${gen_count} pooling headers for ${datatype}") + + # STEP 3: Target creation phase - create test targets + message(STATUS " Creating pooling test targets...") + file(STRINGS ${working_path}/pool_kernel_list.txt kernel_lines) + set(test_count 0) + foreach(line IN LISTS kernel_lines) + string(REPLACE "|" ";" parts "${line}") + list(LENGTH parts parts_len) + if(parts_len EQUAL 3) + list(GET parts 0 kernel_name) + list(GET parts 1 tile_config) + list(GET parts 2 trait_combo) + + create_individual_pool_test_target("${datatype}" "${config_name}" "${trait_combo}" "${tile_config}" "${json_blob}") + math(EXPR test_count "${test_count} + 1") + endif() + endforeach() + message(STATUS " Created ${test_count} pooling test targets for ${datatype}") +endfunction() + +# ============================================================================ +# MAIN EXECUTION - Test Target Generation +# ============================================================================ + +message(STATUS "=== Starting Pooling Tile Engine Test Configuration ===") +message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + +# GPU architecture filtering - only build tests for supported architectures +set(POOLING_TEST_GPU_TARGETS "") +set(DESIRED_TARGETS "gfx90a;gfx942") + +foreach(target IN LISTS SUPPORTED_GPU_TARGETS) + if(target IN_LIST DESIRED_TARGETS) + list(APPEND POOLING_TEST_GPU_TARGETS ${target}) + message(STATUS " Adding GPU target for pooling tests: ${target}") + endif() +endforeach() + +# Early exit if no compatible GPU architectures are available +if(NOT POOLING_TEST_GPU_TARGETS) + message(WARNING "Skipping 
Pooling Tile Engine tests: No supported GPU targets (gfx90a, gfx942) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + return() +endif() + +message(STATUS "Building Pooling tile engine tests for GPU targets: ${POOLING_TEST_GPU_TARGETS}") + +# Enable parallel compilation optimizations +set_property(GLOBAL PROPERTY JOB_POOLS + compile_heavy=4 + compile_normal=16 +) + +# Enable compiler cache if available and explicitly requested +option(ENABLE_CCACHE_TESTS "Enable ccache for test compilation" OFF) +if(ENABLE_CCACHE_TESTS) + find_program(CCACHE_PROGRAM ccache) + if(CCACHE_PROGRAM) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + message(STATUS "Using ccache for faster test compilation") + else() + message(WARNING "ccache requested but not found") + endif() +else() + message(STATUS "ccache disabled for tests (use -DENABLE_CCACHE_TESTS=ON to enable)") +endif() + +# ============================================================================ +# Test Configuration Matrix +# ============================================================================ + +set(TEST_DATATYPES "fp16;fp32") + +# ============================================================================ +# Test Target Generation +# ============================================================================ + +# SIMPLE TEST: Basic functionality validation +set(SIMPLE_TEST_CONFIG "simple_test_config") +set(SIMPLE_TEST_CONFIG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/configs/${SIMPLE_TEST_CONFIG}.json") + +if(EXISTS ${SIMPLE_TEST_CONFIG_FILE}) + message(STATUS "Processing pooling simple test config: ${SIMPLE_TEST_CONFIG}") + foreach(datatype IN LISTS TEST_DATATYPES) + build_pool_test_targets("${datatype}" "${SIMPLE_TEST_CONFIG}") + endforeach() +else() + message(WARNING "Pooling simple test config file not found: ${SIMPLE_TEST_CONFIG_FILE}") +endif() + +message(STATUS "Pooling tile engine tests configured:") +message(STATUS " - Simple test: fp16/fp32") diff --git 
a/test/ck_tile/pooling_tile_engine/README.md b/test/ck_tile/pooling_tile_engine/README.md new file mode 100644 index 00000000000..d6c5984a131 --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/README.md @@ -0,0 +1,88 @@ +# Pooling Tile Engine Tests + +Unit tests for pooling kernels generated by the tile_engine pooling codegen system. + +## Overview + +These tests validate pooling kernels that are generated at CMake configuration time +by `pooling_instance_builder.py`. Each kernel configuration (tile shape + traits) +gets its own GTest executable that verifies correctness against a CPU reference +implementation. + +## Architecture + + +``` +test/ck_tile/pooling_tile_engine/ +├── CMakeLists.txt # Build infrastructure +├── configs/ +│ └── simple_test_config.json # Test configuration with problem sizes +├── extract_test_params.py # Extracts problem sizes to C++ header +├── test_pooling_simple.cpp # GTest driver (parameterized) +└── README.md # This file +``` + +### Build Flow + +1. **CMake configuration**: `CMakeLists.txt` invokes `pooling_instance_builder.py --list_kernels` + to discover valid kernel configurations from the JSON config. +2. **Parameter extraction**: `extract_test_params.py` generates `test_params.hpp` with + problem sizes from the JSON config. +3. **Header generation**: For each kernel, `pooling_instance_builder.py --gen_single` + generates a C++ header defining `SelectedKernel` with the specific tile configuration. +4. **Compilation**: Each kernel gets a separate test executable compiled with the + generated header via `-include`. +5. **Execution**: GTest runs each problem size as a separate test case, comparing + device results against the CPU reference. 
+ +## Configuration + +### `simple_test_config.json` + +Defines: +- **tile_config**: Block/warp/thread tile dimensions for PoolShape +- **trait_config**: Reduce op (max/avg), output_index, propagate_nan, pooling_dim (2d/3d) +- **test_params**: Problem sizes (N, H, W, C, window, stride, dilation, padding) + +### Supported configurations + +- **Data types**: fp16, fp32 +- **Reduce operations**: max (with index output) +- **Pooling dimensions**: 2D (NHWC), 3D (NDHWC) +- **GPU targets**: gfx90a, gfx942 + +## Building + +```bash +# From the build directory: +cmake --build . --target test_pooling_tile_engine_fp16_simple_test_config_max_true_false_2d_128x1_1x1_128x1_2x1 + +# Or build all pooling tests: +cmake --build . --target tests +``` + +## Running + +```bash +# Run a specific test: +./test_pooling_tile_engine_fp16_simple_test_config_max_true_false_2d_128x1_1x1_128x1_2x1 + +# Run with GTest filters: +./test_pooling_tile_engine_fp16_simple_test_config_max_true_false_2d_128x1_1x1_128x1_2x1 --gtest_filter="*BasicFunctionality*" +``` + +## Relationship to tile_engine + +The tile_engine pooling op lives at `tile_engine/ops/pooling/` and provides: +- `pooling_instance_builder.py` - Codegen for kernel headers +- `pooling_validation_utils.py` - Configuration validation +- `pooling_common.hpp` - Shared trait definitions +- `pooling_benchmark.hpp` - Problem/metric definitions +- `pooling_profiler.hpp` - Benchmark profiling +- `pooling_benchmark_single.cpp` - Single-kernel benchmark entry point + +The underlying ck_tile pooling kernel lives at `include/ck_tile/ops/pooling/` and provides: +- `PoolKernel` - GPU kernel implementation +- `PoolProblem` - Problem parameterization +- `PoolShape` - Tile shape specification +- `PoolDefaultPolicy` - Tile distribution and reduction policies diff --git a/test/ck_tile/pooling_tile_engine/configs/simple_test_config.json b/test/ck_tile/pooling_tile_engine/configs/simple_test_config.json new file mode 100644 index 00000000000..72b67745792 --- 
/dev/null +++ b/test/ck_tile/pooling_tile_engine/configs/simple_test_config.json @@ -0,0 +1,60 @@ +{ + "problem": { + "description": "Basic pooling functionality validation with moderate problem sizes" + }, + "test_params": { + "problem_sizes_2d": [ + { + "N": 1, "H": 8, "W": 8, "C": 32, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "N": 2, "H": 16, "W": 16, "C": 32, + "Y": 3, "X": 3, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 1, "pad_h_right": 1, + "pad_w_left": 1, "pad_w_right": 1 + }, + { + "N": 1, "H": 32, "W": 32, "C": 64, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + } + ], + "problem_sizes_3d": [ + { + "N": 1, "D": 4, "H": 4, "W": 4, "C": 32, + "Z": 2, "Y": 2, "X": 2, + "stride_d": 2, "stride_h": 2, "stride_w": 2, + "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, + "pad_d_left": 0, "pad_d_right": 0, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + } + ] + }, + "tile_config": { + "block_m": {"values": [128]}, + "block_n": {"values": [1]}, + "warp_m": {"values": [1]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [128]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [2]}, + "thread_tile_n": {"values": [1]} + }, + "trait_config": { + "reduce_op": {"values": ["max"]}, + "output_index": {"values": [true]}, + "propagate_nan": {"values": [false]}, + "pooling_dim": {"values": ["2d"]} + } +} diff --git a/test/ck_tile/pooling_tile_engine/extract_test_params.py b/test/ck_tile/pooling_tile_engine/extract_test_params.py new file mode 100644 index 00000000000..dc93e55ee9c --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/extract_test_params.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its 
affiliates. +# SPDX-License-Identifier: MIT + +""" +Extract pooling test parameters from config JSON and write to C++ header. +Generates test_params.hpp with problem sizes for parameterized GTest. +""" + +import json +import argparse +import os +from pathlib import Path + + +def extract_test_params(config_file, output_file, pooling_dim="2d"): + """Extract test parameters from config JSON and write to output file""" + + with open(config_file, "r") as f: + config = json.load(f) + + # Extract test parameters based on pooling dimension + test_params = [] + if pooling_dim == "2d": + if "test_params" in config and "problem_sizes_2d" in config["test_params"]: + test_params = config["test_params"]["problem_sizes_2d"] + else: + # Default 2D test parameters + test_params = [ + { + "N": 1, "H": 8, "W": 8, "C": 32, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0, + }, + { + "N": 2, "H": 16, "W": 16, "C": 32, + "Y": 3, "X": 3, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 1, "pad_h_right": 1, + "pad_w_left": 1, "pad_w_right": 1, + }, + ] + else: # 3d + if "test_params" in config and "problem_sizes_3d" in config["test_params"]: + test_params = config["test_params"]["problem_sizes_3d"] + else: + # Default 3D test parameters + test_params = [ + { + "N": 1, "D": 4, "H": 4, "W": 4, "C": 32, + "Z": 2, "Y": 2, "X": 2, + "stride_d": 2, "stride_h": 2, "stride_w": 2, + "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, + "pad_d_left": 0, "pad_d_right": 0, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0, + }, + ] + + # Write to output file in C++ format + output_dir = Path(output_file).parent + output_dir.mkdir(parents=True, exist_ok=True) + + with open(output_file, "w") as f: + f.write("// Generated test parameters for pooling tile_engine tests\n") + f.write("// This file is auto-generated during CMake 
configuration\n\n") + + if pooling_dim == "2d": + f.write( + "static const std::vector CONFIG_TEST_PARAMS = {\n" + ) + for i, params in enumerate(test_params): + comma = "," if i < len(test_params) - 1 else "" + f.write( + f" {{" + f"{params['N']}, {params['H']}, {params['W']}, {params['C']}, " + f"{params['Y']}, {params['X']}, " + f"{params['stride_h']}, {params['stride_w']}, " + f"{params['dilation_h']}, {params['dilation_w']}, " + f"{params['pad_h_left']}, {params['pad_h_right']}, " + f"{params['pad_w_left']}, {params['pad_w_right']}" + f"}}{comma}\n" + ) + f.write("};\n") + else: # 3d + f.write( + "static const std::vector CONFIG_TEST_PARAMS = {\n" + ) + for i, params in enumerate(test_params): + comma = "," if i < len(test_params) - 1 else "" + f.write( + f" {{" + f"{params['N']}, {params['D']}, {params['H']}, {params['W']}, {params['C']}, " + f"{params['Z']}, {params['Y']}, {params['X']}, " + f"{params['stride_d']}, {params['stride_h']}, {params['stride_w']}, " + f"{params['dilation_d']}, {params['dilation_h']}, {params['dilation_w']}, " + f"{params['pad_d_left']}, {params['pad_d_right']}, " + f"{params['pad_h_left']}, {params['pad_h_right']}, " + f"{params['pad_w_left']}, {params['pad_w_right']}" + f"}}{comma}\n" + ) + f.write("};\n") + + print( + f"Extracted {len(test_params)} {pooling_dim} test parameters from {config_file} -> {output_file}" + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Extract pooling test parameters from config JSON" + ) + parser.add_argument("--config_file", required=True, help="Input config JSON file") + parser.add_argument( + "--output_file", required=True, help="Output test parameters file" + ) + parser.add_argument( + "--pooling_dim", + default="2d", + choices=["2d", "3d"], + help="Pooling dimension (2d or 3d)", + ) + + args = parser.parse_args() + + if not os.path.exists(args.config_file): + print(f"Error: Config file not found: {args.config_file}") + return 1 + + extract_test_params(args.config_file, 
args.output_file, args.pooling_dim) + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp b/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp new file mode 100644 index 00000000000..34ad3d79bb9 --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp @@ -0,0 +1,240 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file test_pooling_simple.cpp + * @brief Unit tests for pooling kernels generated by pooling_instance_builder + * + * This test includes kernels generated during CMake configuration by + * pooling_instance_builder.py and tests them with problem sizes extracted + * from the corresponding JSON configuration files. + */ + +#include <gtest/gtest.h> +#include <iostream> +#include <string> +#include <vector> + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" +#include "tile_engine/ops/pooling/pooling_common.hpp" + +// The kernel header is included via compile command line with -include flag +// It defines: SelectedKernel, KERNEL_NAME, InDataType, OutDataType, +// ComputeDataType, IndexDataType, ReduceOpType, +// TensorShape, WindowShape, POOLING_DIM + +// ============================================================================ +// Test parameter structures +// ============================================================================ + +/// @brief Test parameters for 2D pooling +struct PoolTestParams2D +{ + int N, H, W, C; // Input dimensions (NHWC) + int Y, X; // Window size + int stride_h, stride_w; // Strides + int dilation_h, dilation_w; // Dilations + int pad_h_left, pad_h_right; // Height padding + int pad_w_left, pad_w_right; // Width padding +}; + +/// @brief Test parameters for 3D pooling +struct PoolTestParams3D +{ + int N, D, H, W, C; // Input dimensions (NDHWC) + int Z, Y, X; // Window size + int stride_d, stride_h, stride_w; // 
Strides + int dilation_d, dilation_h, dilation_w; // Dilations + int pad_d_left, pad_d_right; // Depth padding + int pad_h_left, pad_h_right; // Height padding + int pad_w_left, pad_w_right; // Width padding +}; + +// Include config-specific test parameters (after parameter structs are defined) +#ifdef POOLING_TEST_PARAMS_HPP +#include POOLING_TEST_PARAMS_HPP +#endif + +// ============================================================================ +// 2D Pooling Tests +// ============================================================================ + +class PoolingTileEngineTest2D : public ::testing::TestWithParam<PoolTestParams2D> +{ + protected: + void SetUp() override + { + auto params = GetParam(); + N_ = params.N; + H_ = params.H; + W_ = params.W; + C_ = params.C; + Y_ = params.Y; + X_ = params.X; + stride_h_ = params.stride_h; + stride_w_ = params.stride_w; + dilation_h_ = params.dilation_h; + dilation_w_ = params.dilation_w; + pad_h_left_ = params.pad_h_left; + pad_h_right_ = params.pad_h_right; + pad_w_left_ = params.pad_w_left; + pad_w_right_ = params.pad_w_right; + + // Calculate output dimensions + ck_tile::index_t Ys = (Y_ - 1) * dilation_h_ + 1; + ck_tile::index_t Xs = (X_ - 1) * dilation_w_ + 1; + Ho_ = (H_ + pad_h_left_ + pad_h_right_ - Ys) / stride_h_ + 1; + Wo_ = (W_ + pad_w_left_ + pad_w_right_ - Xs) / stride_w_ + 1; + } + + int N_, H_, W_, C_; + int Y_, X_; + int stride_h_, stride_w_; + int dilation_h_, dilation_w_; + int pad_h_left_, pad_h_right_; + int pad_w_left_, pad_w_right_; + int Ho_, Wo_; +}; + +TEST_P(PoolingTileEngineTest2D, BasicFunctionality) +{ + // Create host tensors + ck_tile::HostTensor<InDataType> h_in({N_, H_, W_, C_}); + ck_tile::HostTensor<OutDataType> h_out({N_, Ho_, Wo_, C_}); + ck_tile::HostTensor<OutDataType> h_out_ref({N_, Ho_, Wo_, C_}); + ck_tile::HostTensor<IndexDataType> h_out_index({N_, Ho_, Wo_, C_}); + ck_tile::HostTensor<IndexDataType> h_out_ref_index({N_, Ho_, Wo_, C_}); + + // Initialize input with random data + ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(h_in); + h_out.SetZero(); + 
h_out_ref.SetZero(); + + // Device memory + ck_tile::DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + // Build shapes and strides (NHWC layout) + const auto input_shape = ck_tile::make_tuple(N_, H_, W_, C_); + const auto output_shape = ck_tile::make_tuple(N_, Ho_, Wo_, C_); + const auto input_strides = ck_tile::make_tuple(H_ * W_ * C_, W_ * C_, C_, 1); + const auto output_strides = ck_tile::make_tuple(Ho_ * Wo_ * C_, Wo_ * C_, C_, 1); + const auto window_lengths = ck_tile::make_tuple(Y_, X_); + const auto window_strides = ck_tile::make_tuple(stride_h_, stride_w_); + const auto window_dilations = ck_tile::make_tuple(dilation_h_, dilation_w_); + const auto input_left_pads = ck_tile::make_tuple(pad_h_left_, pad_w_left_); + const auto input_right_pads = ck_tile::make_tuple(pad_h_right_, pad_w_right_); + + // Build host args for the generated kernel + auto host_args = + ck_tile::PoolHostArgs<decltype(input_shape), decltype(window_lengths)>{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + // Stream config: no timing overhead for fastest execution + ck_tile::stream_config stream_config{nullptr, false, 0, 0, 1, false, false, 1}; + + // Launch generated kernel + try + { + SelectedKernel::launch(host_args, stream_config); + } + catch(const std::exception& e) + { + std::string error_msg(e.what()); + if(error_msg.find("Arguments not supported") != std::string::npos) + { + GTEST_SKIP() << "Configuration not supported: " << e.what(); + } + else + { + FAIL() << "Kernel launch failed: " << e.what(); + } + } + + // Copy results back + d_out.FromDevice(h_out.data()); + 
d_out_index.FromDevice(h_out_index.data()); + + // Compute reference on host + auto kernel_args_ref = + ck_tile::PoolKernelArgs<decltype(input_shape), decltype(window_lengths)>{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::reference_pool2d<InDataType, ComputeDataType, OutDataType, IndexDataType, ReduceOpType, decltype(input_shape), decltype(window_lengths), SelectedKernel::kOutputIndex>( + h_in, h_out_ref, h_out_ref_index, kernel_args_ref, ReduceOpType{}); + + // Verify value results + bool pass_value = + ck_tile::check_err(h_out, h_out_ref, "Error: Incorrect values!", 1e-5, 1e-5); + EXPECT_TRUE(pass_value) << "Pooling value verification failed for " << KERNEL_NAME; + + // Verify index results if output_index is enabled + if constexpr(SelectedKernel::kOutputIndex) + { + bool pass_index = ck_tile::check_err( + h_out_index, h_out_ref_index, "Error: Incorrect indices!", 0, 0); + EXPECT_TRUE(pass_index) << "Pooling index verification failed for " << KERNEL_NAME; + } +} + +TEST_P(PoolingTileEngineTest2D, KernelInfo) +{ + EXPECT_TRUE(strlen(KERNEL_NAME) > 0) << "Kernel name should not be empty"; + + std::cout << "Testing kernel: " << KERNEL_NAME << std::endl; + std::cout << "Problem size: N=" << N_ << " H=" << H_ << " W=" << W_ << " C=" << C_ + << " Window=" << Y_ << "x" << X_ << " Output=" << Ho_ << "x" << Wo_ + << std::endl; +} + +// Instantiate test suite with config-specific test parameters +// CONFIG_TEST_PARAMS is defined in the auto-generated test_params.hpp file +INSTANTIATE_TEST_SUITE_P( + PoolingVerification, + PoolingTileEngineTest2D, + ::testing::ValuesIn(CONFIG_TEST_PARAMS), + [](const ::testing::TestParamInfo<PoolTestParams2D>& param_info) { + return "N" + std::to_string(param_info.param.N) + "_H" + + std::to_string(param_info.param.H) + "_W" + + std::to_string(param_info.param.W) + "_C" + + std::to_string(param_info.param.C) + "_Y" + + std::to_string(param_info.param.Y) + "_X" + + std::to_string(param_info.param.X); + }); diff --git a/tile_engine/CMakeLists.txt 
b/tile_engine/CMakeLists.txt index b9dc3201282..d1640cb300d 100644 --- a/tile_engine/CMakeLists.txt +++ b/tile_engine/CMakeLists.txt @@ -7,5 +7,6 @@ include_directories(BEFORE add_subdirectory(ops/gemm) add_subdirectory(ops/gemm_streamk) +add_subdirectory(ops/pooling) add_subdirectory(ops/reduce) diff --git a/tile_engine/ops/pooling/CMakeLists.txt b/tile_engine/ops/pooling/CMakeLists.txt new file mode 100644 index 00000000000..594f2ca6219 --- /dev/null +++ b/tile_engine/ops/pooling/CMakeLists.txt @@ -0,0 +1,205 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# ============================================================================ +# Pooling Tile Engine Build Configuration +# +# Generates individual benchmark executables for pooling kernels +# ============================================================================ + +set(POOLING_DATATYPE "fp16;fp32" CACHE STRING "List of datatypes for Pooling (semicolon-separated)") +set(POOLING_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)") +option(ENABLE_CCACHE_POOLING "Enable ccache for pooling ops compilation" OFF) + +# Store the directory path for use in functions +set(POOLING_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}) + +# ============================================================================ +# create_individual_pool_target +# +# Creates a single benchmark executable for a specific pooling kernel config. 
+# ============================================================================ +function(create_individual_pool_target datatype trait tile_config config_json) + if(NOT POOLING_GPU_TARGETS) + message(WARNING "Skipping individual pooling target: No supported GPU targets") + return() + endif() + + set(target_name "benchmark_pooling_${datatype}_${trait}_${tile_config}") + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}") + + # Generated header path + set(instance_header "${working_path}/pooling_single_pool_${datatype}_${trait}_${tile_config}.hpp") + + # Add custom command to generate the header file at build time + add_custom_command( + OUTPUT ${instance_header} + COMMAND ${Python3_EXECUTABLE} ${POOLING_SOURCE_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${config_json} + --gen_single + --kernel_name "pool_${datatype}_${trait}_${tile_config}" + --tile_config "${tile_config}" + --trait_combo "${trait}" + DEPENDS ${POOLING_SOURCE_DIR}/pooling_instance_builder.py ${config_json} + COMMENT "Generating ${instance_header}" + ) + + # Create the executable + add_executable(${target_name} + ${POOLING_SOURCE_DIR}/pooling_benchmark_single.cpp + ${instance_header} + ) + + # Set GPU architectures + set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOLING_GPU_TARGETS}) + + # Set compile definitions + target_compile_definitions(${target_name} PRIVATE + POOLING_SINGLE_INSTANCE_HPP="${instance_header}" + ) + + # Include directories + target_include_directories(${target_name} PRIVATE + ${POOLING_SOURCE_DIR} + ${working_path} + ) + + # Compile options + target_compile_options(${target_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + -include ${instance_header} + ) + + # Add FP8 format definitions if needed + if(CK_USE_OCP_FP8) + target_compile_options(${target_name} PRIVATE -DCK_TILE_USE_OCP_FP8) + endif() + + # Add to collection targets + 
# ============================================================================
# build_individual_pool_targets
#
# Builds all benchmark targets for a specific datatype.
#
# The set of targets is computed at *configure* time: the instance builder is
# run with --list_kernels, which writes pool_kernel_count.txt and
# pool_kernel_list.txt into the per-datatype working directory. Every line of
# the list file has the form
#     <kernel_name>|<tile_config>|<trait_combo>
# and produces one call to create_individual_pool_target().
#
# Config file selection (first match wins):
#   1. environment variable POOLING_CONFIG_FILE
#   2. CMake variable       POOLING_CONFIG_FILE
#   3. configs/default_config.json
# ============================================================================
function(build_individual_pool_targets datatype)
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}")

    # POOLING_SOURCE_DIR is used instead of CMAKE_CURRENT_LIST_DIR because the
    # latter is evaluated in the caller's context when used inside a function.
    set(configs_dir "${POOLING_SOURCE_DIR}/configs")

    # Choose config file
    if(DEFINED ENV{POOLING_CONFIG_FILE} AND NOT "$ENV{POOLING_CONFIG_FILE}" STREQUAL "")
        set(config_filename "$ENV{POOLING_CONFIG_FILE}")
        set(json_blob "${configs_dir}/${config_filename}")
        message(STATUS " Using config from environment variable: ${config_filename}")
    elseif(NOT "${POOLING_CONFIG_FILE}" STREQUAL "")
        set(json_blob "${configs_dir}/${POOLING_CONFIG_FILE}")
        message(STATUS " Using custom config: ${POOLING_CONFIG_FILE}")
    else()
        set(json_blob "${configs_dir}/default_config.json")
        message(STATUS " Using default config for pooling")
    endif()

    if(NOT EXISTS "${json_blob}")
        message(FATAL_ERROR "Config file not found: ${json_blob}")
    endif()

    # The target list depends on the config and on the generator scripts, so a
    # change to any of them must re-run the configure step; otherwise stale
    # targets would silently be kept.
    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
        "${json_blob}"
        "${POOLING_SOURCE_DIR}/pooling_instance_builder.py"
        "${POOLING_SOURCE_DIR}/pooling_validation_utils.py"
    )

    file(MAKE_DIRECTORY "${working_path}")

    # Step 1: List kernels
    message(STATUS " Listing pooling kernel configurations for ${datatype}...")
    execute_process(
        COMMAND ${Python3_EXECUTABLE} -u "${POOLING_SOURCE_DIR}/pooling_instance_builder.py"
                --working_path "${working_path}"
                --datatype ${datatype}
                --config_json "${json_blob}"
                --list_kernels
        WORKING_DIRECTORY "${POOLING_SOURCE_DIR}"
        RESULT_VARIABLE ret
        OUTPUT_VARIABLE list_output
        ERROR_VARIABLE list_error
    )

    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to list pooling kernels for ${datatype}: ${list_error}")
    endif()

    # Read kernel count (informational only)
    if(EXISTS "${working_path}/pool_kernel_count.txt")
        file(READ "${working_path}/pool_kernel_count.txt" kernel_count)
        string(STRIP "${kernel_count}" kernel_count)
        message(STATUS " Found ${kernel_count} pooling kernel configurations")
    else()
        message(FATAL_ERROR "Pooling kernel count file not found")
    endif()

    # Step 2: Create targets
    if(NOT EXISTS "${working_path}/pool_kernel_list.txt")
        message(FATAL_ERROR "Pooling kernel list file not found")
    endif()

    file(STRINGS "${working_path}/pool_kernel_list.txt" kernel_lines)
    foreach(line IN LISTS kernel_lines)
        # Turn "name|tile|trait" into a CMake list.
        string(REPLACE "|" ";" parts "${line}")
        list(LENGTH parts parts_len)
        if(parts_len EQUAL 3)
            # Field 0 (the kernel name) is not needed here: the target helper
            # rebuilds its own name from datatype, trait and tile config.
            list(GET parts 1 tile_config)
            list(GET parts 2 trait_combo)
            create_individual_pool_target("${datatype}" "${trait_combo}" "${tile_config}" "${json_blob}")
        else()
            # Previously such lines were dropped without any diagnostic.
            message(WARNING "Ignoring malformed pooling kernel list entry: '${line}'")
        endif()
    endforeach()
endfunction()
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + message(STATUS "Using ccache for pooling compilation") + endif() + endif() + + # Create collection targets + add_custom_target(benchmark_pooling_all) + + foreach(dt IN LISTS POOLING_DATATYPE) + add_custom_target(benchmark_pooling_${dt}) + endforeach() + + # Build targets for each datatype + foreach(dt IN LISTS POOLING_DATATYPE) + build_individual_pool_targets(${dt}) + endforeach() +endif() diff --git a/tile_engine/ops/pooling/configs/default_config.json b/tile_engine/ops/pooling/configs/default_config.json new file mode 100644 index 00000000000..89af488eb61 --- /dev/null +++ b/tile_engine/ops/pooling/configs/default_config.json @@ -0,0 +1,21 @@ +{ + "problem": { + "description": "Default pooling configuration for tile_engine benchmarks" + }, + "tile_config": { + "block_m": {"values": [128]}, + "block_n": {"values": [1]}, + "warp_m": {"values": [1]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [128]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [2]}, + "thread_tile_n": {"values": [1]} + }, + "trait_config": { + "reduce_op": {"values": ["max"]}, + "output_index": {"values": [true]}, + "propagate_nan": {"values": [false]}, + "pooling_dim": {"values": ["2d"]} + } +} diff --git a/tile_engine/ops/pooling/pooling_benchmark.hpp b/tile_engine/ops/pooling/pooling_benchmark.hpp new file mode 100644 index 00000000000..8376571b4fa --- /dev/null +++ b/tile_engine/ops/pooling/pooling_benchmark.hpp @@ -0,0 +1,133 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" + +namespace ck_tile { + +/// @brief Performance metrics for benchmarking +enum class PoolMetric +{ + LATENCY, + BANDWIDTH +}; + +/// @brief Pooling problem specification for 2D pooling +struct PoolProblem2D +{ + index_t N, H, W, C; // Input dimensions (NHWC) + index_t Y, X; // Window dimensions + index_t stride_h, stride_w; // Window strides + index_t dilation_h, dilation_w; // Window dilations + index_t pad_h_left, pad_h_right; // Height padding + index_t pad_w_left, pad_w_right; // Width padding + std::string datatype; // Data type name + std::string reduce_op; // "max" or "avg" + + index_t Ho() const + { + index_t Ys = (Y - 1) * dilation_h + 1; + return (H + pad_h_left + pad_h_right - Ys) / stride_h + 1; + } + + index_t Wo() const + { + index_t Xs = (X - 1) * dilation_w + 1; + return (W + pad_w_left + pad_w_right - Xs) / stride_w + 1; + } + + index_t input_elements() const { return N * H * W * C; } + index_t output_elements() const { return N * Ho() * Wo() * C; } + + std::string to_string() const + { + std::ostringstream oss; + oss << "N" << N << "_H" << H << "_W" << W << "_C" << C << "_Y" << Y << "_X" << X + << "_Sh" << stride_h << "_Sw" << stride_w << "_Dh" << dilation_h << "_Dw" + << dilation_w; + if(pad_h_left > 0 || pad_w_left > 0) + oss << "_Ph" << pad_h_left << "_Pw" << pad_w_left; + return oss.str(); + } +}; + +/// @brief Pooling problem specification for 3D pooling +struct PoolProblem3D +{ + index_t N, D, H, W, C; // Input dimensions (NDHWC) + index_t Z, Y, X; // Window dimensions + index_t stride_d, stride_h, stride_w; // Window strides + index_t dilation_d, dilation_h, dilation_w; // Window dilations + index_t pad_d_left, pad_d_right; // Depth padding + index_t pad_h_left, pad_h_right; // 
/// @brief Performance result for a pooling kernel
///
/// Both fields are zero-initialized so that a default-constructed result can
/// be printed safely (previously the floats were indeterminate). The struct
/// stays an aggregate, so `PoolPerformanceResult{latency, bandwidth}` keeps
/// working.
struct PoolPerformanceResult
{
    float latency_ms     = 0.0f; // printed with an "ms" suffix
    float bandwidth_gb_s = 0.0f; // printed with a "GB/s" suffix

    /// @brief Format as "latency=<x>ms, bandwidth=<y>GB/s" using default
    ///        stream formatting for floats.
    std::string to_string() const
    {
        std::ostringstream oss;
        oss << "latency=" << latency_ms << "ms, bandwidth=" << bandwidth_gb_s << "GB/s";
        return oss.str();
    }
};
+ * + * This benchmark includes the generated kernel header via -include flag + * and runs the pooling kernel with specified problem sizes. + */ + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" +#include "tile_engine/ops/pooling/pooling_common.hpp" +#include "tile_engine/ops/pooling/pooling_benchmark.hpp" + +// The kernel header is included via compile command line with -include flag +// It defines: SelectedKernel, KERNEL_NAME, InDataType, OutDataType, etc. + +static ck_tile::ArgParser create_args() +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("n", "1", "Batch size (N)") + .insert("h", "16", "Input height (H)") + .insert("w", "16", "Input width (W)") + .insert("c", "32", "Channels (C)") + .insert("wy", "2", "Window height (Y)") + .insert("wx", "2", "Window width (X)") + .insert("sy", "2", "Window stride height") + .insert("sx", "2", "Window stride width") + .insert("dy", "1", "Window dilation height") + .insert("dx", "1", "Window dilation width") + .insert("phy", "0", "Padding height left") + .insert("phyr", "0", "Padding height right") + .insert("pwx", "0", "Padding width left") + .insert("pwxr", "0", "Padding width right") + .insert("verify", "1", "Verify results (0/1)") + .insert("warmup", "5", "Warmup iterations") + .insert("repeat", "20", "Repeat iterations") + .insert("log", "1", "Log level"); + return arg_parser; +} + +int benchmark_pooling_single(int argc, char* argv[]) +{ + auto arg_parser = create_args(); + bool result = arg_parser.parse(argc, argv); + if(!result) + return -1; + + // Parse problem dimensions + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); + ck_tile::index_t Y = arg_parser.get_int("wy"); + ck_tile::index_t X = arg_parser.get_int("wx"); + ck_tile::index_t Sy 
= arg_parser.get_int("sy"); + ck_tile::index_t Sx = arg_parser.get_int("sx"); + ck_tile::index_t Dy = arg_parser.get_int("dy"); + ck_tile::index_t Dx = arg_parser.get_int("dx"); + ck_tile::index_t LeftPy = arg_parser.get_int("phy"); + ck_tile::index_t RightPy = arg_parser.get_int("phyr"); + ck_tile::index_t LeftPx = arg_parser.get_int("pwx"); + ck_tile::index_t RightPx = arg_parser.get_int("pwxr"); + + bool verify = arg_parser.get_int("verify") != 0; + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int log_level = arg_parser.get_int("log"); + + // Calculate output dimensions + ck_tile::index_t Ys = (Y - 1) * Dy + 1; + ck_tile::index_t Xs = (X - 1) * Dx + 1; + ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1; + ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1; + + std::cout << "Pooling benchmark: " << KERNEL_NAME << std::endl; + std::cout << " Input: NHWC = " << N << "x" << H << "x" << W << "x" << C << std::endl; + std::cout << " Output: NHWC = " << N << "x" << Ho << "x" << Wo << "x" << C << std::endl; + std::cout << " Window: " << Y << "x" << X << ", stride: " << Sy << "x" << Sx + << ", dilation: " << Dy << "x" << Dx << std::endl; + + // Create host tensors + ck_tile::HostTensor h_in({N, H, W, C}); + ck_tile::HostTensor h_out({N, Ho, Wo, C}); + ck_tile::HostTensor h_out_ref({N, Ho, Wo, C}); + ck_tile::HostTensor h_out_index({N, Ho, Wo, C}); + ck_tile::HostTensor h_out_ref_index({N, Ho, Wo, C}); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(h_in); + + // Device memory + ck_tile::DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + // Build host args + const auto input_shape = ck_tile::make_tuple(N, H, W, C); + const auto output_shape = ck_tile::make_tuple(N, Ho, Wo, C); + 
const auto input_strides = ck_tile::make_tuple(H * W * C, W * C, C, 1); + const auto output_strides = ck_tile::make_tuple(Ho * Wo * C, Wo * C, C, 1); + const auto window_lengths = ck_tile::make_tuple(Y, X); + const auto window_strides = ck_tile::make_tuple(Sy, Sx); + const auto window_dilations = ck_tile::make_tuple(Dy, Dx); + const auto input_left_pads = ck_tile::make_tuple(LeftPy, LeftPx); + const auto input_right_pads = ck_tile::make_tuple(RightPy, RightPx); + + auto host_args = + ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + // Stream configuration + ck_tile::stream_config stream{nullptr, true, log_level, warmup, repeat}; + + // Launch kernel + float latency = 0; + try + { + latency = SelectedKernel::launch(host_args, stream); + } + catch(const std::exception& e) + { + std::cerr << "Kernel launch failed: " << e.what() << std::endl; + return -1; + } + + // Calculate bandwidth + size_t bytes_read = static_cast(N) * H * W * C * sizeof(InDataType); + size_t bytes_written = static_cast(N) * Ho * Wo * C * sizeof(OutDataType); + float bandwidth = (bytes_read + bytes_written) / (latency * 1e-3f) / 1e9f; + + std::cout << " Latency: " << latency << " ms" << std::endl; + std::cout << " Bandwidth: " << bandwidth << " GB/s" << std::endl; + + // Verify if requested + if(verify) + { + d_out.FromDevice(h_out.data()); + d_out_index.FromDevice(h_out_index.data()); + + auto kernel_args = + ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::reference_pool2d( + h_in, h_out_ref, h_out_ref_index, kernel_args, ReduceOpType{}); + + bool pass_value = + 
namespace ck_tile {

/// @brief Kernel trait parameters for pooling tile_engine configurations
///
/// The boolean flags are initialized to false so that a default-constructed
/// object can be passed to to_string() without reading indeterminate values.
struct PoolingKernelTraits
{
    std::string reduce_op;      // "max" or "avg"
    bool output_index  = false; // Whether to output indices (max pooling)
    bool propagate_nan = false; // Whether to propagate NaN values
    bool cross_warp    = false; // Whether cross-warp reduction is used

    /// @brief Encode the traits as
    ///        "<reduce_op>_<idx|noidx>_<nan|nonan>_<crosswarp|nocrosswarp>".
    std::string to_string() const
    {
        std::ostringstream oss;
        oss << reduce_op << "_"
            << (output_index ? "idx" : "noidx") << "_"
            << (propagate_nan ? "nan" : "nonan") << "_"
            << (cross_warp ? "crosswarp" : "nocrosswarp");
        return oss.str();
    }
};

/// @brief Extract traits from a kernel name string
///
/// Detection is purely substring based:
///  - reduce_op is "max" if the name contains "max", otherwise "avg";
///  - each flag is set when its positive token ("idx", "nan", "crosswarp")
///    occurs and the negated token ("noidx", "nonan", "nocrosswarp") does not.
///    The second check is required because every negated token contains the
///    positive one as a substring.
///
/// @param name Kernel name, e.g. the output of PoolingKernelTraits::to_string().
/// @return Traits decoded from @p name.
inline PoolingKernelTraits extract_pooling_traits_from_name(const std::string& name)
{
    const auto contains = [&name](const char* token) {
        return name.find(token) != std::string::npos;
    };

    PoolingKernelTraits traits;
    traits.reduce_op     = contains("max") ? "max" : "avg";
    traits.output_index  = contains("idx") && !contains("noidx");
    traits.propagate_nan = contains("nan") && !contains("nonan");
    traits.cross_warp    = contains("crosswarp") && !contains("nocrosswarp");
    return traits;
}

} // namespace ck_tile
configuration if no config file is provided""" + return { + "tile_config": { + "block_m": {"values": [128]}, + "block_n": {"values": [1]}, + "warp_m": {"values": [1]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [128]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [2]}, + "thread_tile_n": {"values": [1]}, + }, + "trait_config": { + "reduce_op": {"values": ["max"]}, + "output_index": {"values": [True]}, + "propagate_nan": {"values": [False]}, + "pooling_dim": {"values": ["2d"]}, + }, + } + + def _get_tile_configs(self, fast_mode=False): + """Get tile configurations from config""" + if "tile_config" not in self.config: + return [] + + tile_config = self.config["tile_config"] + + block_m_values = tile_config.get("block_m", {}).get("values", [128]) + block_n_values = tile_config.get("block_n", {}).get("values", [1]) + warp_m_values = tile_config.get("warp_m", {}).get("values", [1]) + warp_n_values = tile_config.get("warp_n", {}).get("values", [1]) + warp_tile_m_values = tile_config.get("warp_tile_m", {}).get("values", [128]) + warp_tile_n_values = tile_config.get("warp_tile_n", {}).get("values", [1]) + thread_tile_m_values = tile_config.get("thread_tile_m", {}).get("values", [2]) + thread_tile_n_values = tile_config.get("thread_tile_n", {}).get("values", [1]) + + configs = [] + for block_m in block_m_values: + for block_n in block_n_values: + for warp_m in warp_m_values: + for warp_n in warp_n_values: + for warp_tile_m in warp_tile_m_values: + for warp_tile_n in warp_tile_n_values: + for thread_tile_m in thread_tile_m_values: + for thread_tile_n in thread_tile_n_values: + if self._validate_tile_config( + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + fast_mode=fast_mode, + ): + configs.append( + { + "block_m": block_m, + "block_n": block_n, + "warp_m": warp_m, + "warp_n": warp_n, + "warp_tile_m": warp_tile_m, + "warp_tile_n": warp_tile_n, + "thread_tile_m": thread_tile_m, + 
"thread_tile_n": thread_tile_n, + } + ) + return configs + + def _validate_tile_config( + self, + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + fast_mode=False, + ): + """Validate that tile configuration is reasonable""" + if fast_mode: + # Basic sanity checks only + if any( + v <= 0 + for v in [ + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + ] + ): + return False + if warp_tile_m % thread_tile_m != 0: + return False + if warp_tile_n % thread_tile_n != 0: + return False + return True + else: + # Determine data types + in_datatype = self.datatype + out_datatype = self.datatype + + return is_tile_config_valid( + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + in_datatype, + out_datatype, + ) + + def _generate_trait_combinations(self): + """Generate all combinations of traits""" + if "trait_config" not in self.config: + return [("max", True, False, "2d")] + + trait_config = self.config["trait_config"] + + reduce_ops = trait_config.get("reduce_op", {}).get("values", ["max"]) + output_indices = trait_config.get("output_index", {}).get("values", [True]) + propagate_nans = trait_config.get("propagate_nan", {}).get("values", [False]) + pooling_dims = trait_config.get("pooling_dim", {}).get("values", ["2d"]) + + all_combinations = list( + itertools.product(reduce_ops, output_indices, propagate_nans, pooling_dims) + ) + + # Filter valid combinations + combinations = [] + for combo in all_combinations: + reduce_op, output_index, propagate_nan, pooling_dim = combo + if is_trait_combination_valid( + reduce_op, output_index, propagate_nan, pooling_dim + ): + combinations.append(combo) + else: + logging.debug( + f"Skipping unsupported trait combination: {reduce_op}-{output_index}-{propagate_nan}-{pooling_dim}" + ) + + return combinations + + def _get_dtype_string(self): + """Get C++ type string for 
datatype""" + dtype_map = { + "fp16": "ck_tile::fp16_t", + "bf16": "ck_tile::bf16_t", + "fp32": "float", + "fp64": "double", + } + return dtype_map.get(self.datatype, "float") + + def _get_reduce_op_string(self, reduce_op): + """Get C++ reduce op type string""" + reduce_op_map = { + "max": "ck_tile::ReduceOp::Max", + "avg": "ck_tile::ReduceOp::Add", + } + return reduce_op_map.get(reduce_op, "ck_tile::ReduceOp::Max") + + def _generate_kernel_instance(self, tile_config, trait_combo, is_header=True): + """Generate a single kernel instance header""" + reduce_op, output_index, propagate_nan, pooling_dim = trait_combo + + # Create kernel name + kernel_name = ( + f"pool_{self.datatype}_{pooling_dim}_{reduce_op}_" + f"{'idx' if output_index else 'noidx'}_" + f"{'nan' if propagate_nan else 'nonan'}" + ) + + # Create tile configuration string + tile_str = ( + f"{tile_config['block_m']}x{tile_config['block_n']}_" + f"{tile_config['warp_m']}x{tile_config['warp_n']}_" + f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}_" + f"{tile_config['thread_tile_m']}x{tile_config['thread_tile_n']}" + ) + + kernel_name += f"_{tile_str}" + + # Determine types + in_type = self._get_dtype_string() + out_type = in_type + compute_type = "float" # Always use float for computation + index_type = "ck_tile::index_t" + reduce_op_type = self._get_reduce_op_string(reduce_op) + + output_index_str = "true" if output_index else "false" + propagate_nan_str = "true" if propagate_nan else "false" + + # Generate 2D or 3D specific code + if pooling_dim == "2d": + tensor_shape_type = "ck_tile::tuple" + window_shape_type = "ck_tile::tuple" + reference_func = "ck_tile::reference_pool2d" + tensor_rank = 4 + window_rank = 2 + else: + tensor_shape_type = "ck_tile::tuple" + window_shape_type = "ck_tile::tuple" + reference_func = "ck_tile::reference_pool3d" + tensor_rank = 5 + window_rank = 3 + + pragma_line = "#pragma once\n" if is_header else "" + instance_code = f"""// Generated kernel instance for 
{kernel_name} +{pragma_line} +#include +#include +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/pooling.hpp" + +using InDataType = {in_type}; +using OutDataType = {out_type}; +using ComputeDataType = {compute_type}; +using IndexDataType = {index_type}; +using ReduceOpType = {reduce_op_type}; + +using TensorShape = {tensor_shape_type}; +using WindowShape = {window_shape_type}; + +// Kernel name for display +constexpr const char* KERNEL_NAME = "{kernel_name}"; +constexpr int POOLING_DIM = {window_rank}; + +// Wrapper for simplified launch interface +struct SelectedKernel {{ + // Tile configuration - PoolShape parameters + static constexpr ck_tile::index_t Block_M = {tile_config["block_m"]}; + static constexpr ck_tile::index_t Block_N = {tile_config["block_n"]}; + static constexpr ck_tile::index_t WarpPerBlock_M = {tile_config["warp_m"]}; + static constexpr ck_tile::index_t WarpPerBlock_N = {tile_config["warp_n"]}; + static constexpr ck_tile::index_t WarpTile_M = {tile_config["warp_tile_m"]}; + static constexpr ck_tile::index_t WarpTile_N = {tile_config["warp_tile_n"]}; + static constexpr ck_tile::index_t ThreadTile_M = {tile_config["thread_tile_m"]}; + static constexpr ck_tile::index_t ThreadTile_N = {tile_config["thread_tile_n"]}; + + // Traits + static constexpr bool kOutputIndex = {output_index_str}; + static constexpr bool kPropagateNan = {propagate_nan_str}; + + // Pool shape + using BlockWarps = ck_tile::sequence; + using BlockTile = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using ThreadTile = ck_tile::sequence; + + using PoolShapeType = ck_tile::PoolShape; + + // Problem and kernel types + using Problem = ck_tile::PoolProblem; + using Kernel = ck_tile::PoolKernel; + + static float launch(ck_tile::PoolHostArgs& args, + const ck_tile::stream_config& stream) {{ + + constexpr ck_tile::index_t kBlockPerCu = 1; + const ck_tile::index_t kBlockSize = Kernel::BlockSize(); + + auto kernel_args = 
def write_kernel_list(self):
    """Write the kernel inventory that the CMake configure step reads.

    Two files are created in ``self.working_path``:

    * ``pool_kernel_count.txt`` - the number of kernels, without a newline.
    * ``pool_kernel_list.txt``  - one ``<name>|<tile_str>|<trait_str>`` line
      per kernel, tile configurations varying slowest.

    Tile configurations come from ``self._get_tile_configs(fast_mode=False)``
    and trait combinations from ``self._generate_trait_combinations()``.
    """
    tiles = self._get_tile_configs(fast_mode=False)
    traits = self._generate_trait_combinations()

    # Pairs of tile-config keys rendered as "<a>x<b>", joined with "_".
    tile_pairs = (
        ("block_m", "block_n"),
        ("warp_m", "warp_n"),
        ("warp_tile_m", "warp_tile_n"),
        ("thread_tile_m", "thread_tile_n"),
    )

    def flag(value):
        # Lower-case literal used in the trait string.
        return "true" if value else "false"

    entries = []
    for tile, combo in itertools.product(tiles, traits):
        reduce_op, output_index, propagate_nan, pooling_dim = combo

        tile_str = "_".join(f"{tile[a]}x{tile[b]}" for a, b in tile_pairs)

        idx_tag = "idx" if output_index else "noidx"
        nan_tag = "nan" if propagate_nan else "nonan"
        name = (
            f"pool_{self.datatype}_{pooling_dim}_{reduce_op}_"
            f"{idx_tag}_{nan_tag}_{tile_str}"
        )

        trait_str = (
            f"{reduce_op}_{flag(output_index)}_{flag(propagate_nan)}_{pooling_dim}"
        )

        entries.append(f"{name}|{tile_str}|{trait_str}\n")

    # Write kernel count
    (self.working_path / "pool_kernel_count.txt").write_text(str(len(entries)))

    # Write kernel list
    (self.working_path / "pool_kernel_list.txt").write_text("".join(entries))

    print(f"Listed {len(entries)} kernel configurations")
+ + header_file = working_path / f"pooling_single_{kernel_name}.hpp" + with open(header_file, "w") as f: + f.write(instance_code) + + return (kernel_name, trait_combo, tile_config) + except Exception as e: + print(f"Error generating individual kernel: {e}") + return None + + +def main(): + parser = argparse.ArgumentParser( + description="Pooling kernel instance builder for tile_engine" + ) + parser.add_argument("--working_path", required=True, help="Working directory path") + parser.add_argument( + "--datatype", + required=True, + choices=["fp16", "bf16", "fp32"], + help="Data type", + ) + parser.add_argument("--config_json", help="Configuration JSON file") + parser.add_argument( + "--num_workers", type=int, help="Number of parallel workers (default: auto)" + ) + parser.add_argument( + "--gen_individual", action="store_true", help="Generate individual kernel files" + ) + parser.add_argument( + "--gen_single", action="store_true", help="Generate a single kernel file" + ) + parser.add_argument("--kernel_name", help="Kernel name for single generation") + parser.add_argument( + "--tile_config", help="Tile configuration string for single generation" + ) + parser.add_argument( + "--trait_combo", help="Trait combination string for single generation" + ) + parser.add_argument( + "--list_kernels", + action="store_true", + help="List kernel configurations without generating files", + ) + + args = parser.parse_args() + + builder = PoolingKernelBuilder(args.working_path, args.datatype, args.config_json) + + if args.list_kernels: + builder.write_kernel_list() + elif args.gen_single: + if not args.kernel_name or not args.tile_config or not args.trait_combo: + parser.error( + "--gen_single requires --kernel_name, --tile_config, and --trait_combo" + ) + + # Parse tile config: "block_mx block_n_warp_mxwarp_n_warp_tile_mxwarp_tile_n_thread_tile_mxthread_tile_n" + tile_parts = args.tile_config.split("_") + block_dims = tile_parts[0].split("x") + warp_dims = tile_parts[1].split("x") + 
warp_tile_dims = tile_parts[2].split("x") + thread_tile_dims = tile_parts[3].split("x") + + tile_config = { + "block_m": int(block_dims[0]), + "block_n": int(block_dims[1]), + "warp_m": int(warp_dims[0]), + "warp_n": int(warp_dims[1]), + "warp_tile_m": int(warp_tile_dims[0]), + "warp_tile_n": int(warp_tile_dims[1]), + "thread_tile_m": int(thread_tile_dims[0]), + "thread_tile_n": int(thread_tile_dims[1]), + } + + # Parse trait combo: "reduce_op_output_index_propagate_nan_pooling_dim" + trait_parts = args.trait_combo.split("_") + trait_combo = ( + trait_parts[0], # reduce_op + trait_parts[1].lower() == "true", # output_index + trait_parts[2].lower() == "true", # propagate_nan + trait_parts[3], # pooling_dim + ) + + kernel_name, instance_code = builder._generate_kernel_instance( + tile_config, trait_combo + ) + + header_file = builder.working_path / f"pooling_single_{kernel_name}.hpp" + with open(header_file, "w") as f: + f.write(instance_code) + + print(f"Generated {header_file}") + + elif args.gen_individual: + builder.run(args.num_workers) + else: + parser.error( + "Must specify one of: --list_kernels, --gen_individual, or --gen_single" + ) + + +if __name__ == "__main__": + main() diff --git a/tile_engine/ops/pooling/pooling_profiler.hpp b/tile_engine/ops/pooling/pooling_profiler.hpp new file mode 100644 index 00000000000..d98fd1f59c2 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_profiler.hpp @@ -0,0 +1,149 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" +#include "tile_engine/ops/pooling/pooling_benchmark.hpp" + +namespace ck_tile { + +/// @brief Profiler for pooling kernels. +/// +/// Handles tensor setup, kernel launch, reference computation, and verification +/// for 2D pooling benchmarks. 
+template +class PoolProfiler2D +{ + public: + PoolProfiler2D(const PoolBenchmarkSetting& setting) : setting_(setting) {} + + /// @brief Benchmark a 2D pooling kernel + /// @param problem The pooling problem specification + /// @param kernel_func Function that launches the kernel and returns latency + template + PoolPerformanceResult benchmark(const PoolProblem2D& problem, KernelFunc kernel_func) + { + const index_t Ho = problem.Ho(); + const index_t Wo = problem.Wo(); + + // Create host tensors + HostTensor h_in({problem.N, problem.H, problem.W, problem.C}); + HostTensor h_out({problem.N, Ho, Wo, problem.C}); + HostTensor h_out_ref({problem.N, Ho, Wo, problem.C}); + HostTensor h_out_index({problem.N, Ho, Wo, problem.C}); + HostTensor h_out_ref_index({problem.N, Ho, Wo, problem.C}); + + // Initialize + FillUniformDistribution{-5.f, 5.f}(h_in); + h_out.SetZero(); + h_out_ref.SetZero(); + + // Device memory + DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + // Build kernel args + const auto input_shape = + make_tuple(problem.N, problem.H, problem.W, problem.C); + const auto output_shape = make_tuple(problem.N, Ho, Wo, problem.C); + const auto input_strides = make_tuple( + problem.H * problem.W * problem.C, problem.W * problem.C, problem.C, 1); + const auto output_strides = + make_tuple(Ho * Wo * problem.C, Wo * problem.C, problem.C, 1); + const auto window_lengths = make_tuple(problem.Y, problem.X); + const auto window_strides = make_tuple(problem.stride_h, problem.stride_w); + const auto window_dilations = make_tuple(problem.dilation_h, problem.dilation_w); + const auto input_left_pads = make_tuple(problem.pad_h_left, problem.pad_w_left); + const auto input_right_pads = + make_tuple(problem.pad_h_right, problem.pad_w_right); + + auto host_args = + 
PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + // Launch kernel + float latency = kernel_func(host_args); + + // Copy back + d_out.FromDevice(h_out.data()); + d_out_index.FromDevice(h_out_index.data()); + + // Verify if requested + if(setting_.verify) + { + auto kernel_args_ref = + PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + // Use ReduceOp::Max as default for reference + using ReduceOp = ReduceOp::Max; + reference_pool2d( + h_in, h_out_ref, h_out_ref_index, kernel_args_ref, ReduceOp{}); + + bool pass = + check_err(h_out, h_out_ref, "Error: Incorrect results!", 1e-3, 1e-3); + if(!pass) + { + std::cerr << "Verification FAILED!" << std::endl; + } + else + { + std::cout << "Verification PASSED" << std::endl; + } + } + + // Calculate bandwidth + size_t bytes_read = problem.input_elements() * sizeof(InDataType); + size_t bytes_written = problem.output_elements() * sizeof(OutDataType); + float bandwidth = (bytes_read + bytes_written) / (latency * 1e-3f) / 1e9f; + + return PoolPerformanceResult{latency, bandwidth}; + } + + private: + PoolBenchmarkSetting setting_; +}; + +} // namespace ck_tile diff --git a/tile_engine/ops/pooling/pooling_validation_utils.py b/tile_engine/ops/pooling/pooling_validation_utils.py new file mode 100644 index 00000000000..083710d3467 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_validation_utils.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Validation utilities for pooling tile_engine configurations. 
+Validates tile configurations and trait combinations for pooling kernels. +""" + +import logging + +logging.basicConfig(level=logging.INFO) + +WARP_SIZE = 64 # AMD wavefront size + + +def is_tile_config_valid( + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + in_datatype, + out_datatype, +): + """ + Validate a pooling tile configuration. + + For pooling, the 2D tile is: + M = output elements (N*Ho*Wo*C for 2D, N*Do*Ho*Wo*C for 3D) + N = reduction dimension (window elements: Y*X for 2D, Z*Y*X for 3D) + + BlockShape params: + BlockWarps = (warp_m, warp_n) + BlockTile = (block_m, block_n) + WarpTile = (warp_tile_m, warp_tile_n) + ThreadTile = (thread_tile_m, thread_tile_n) + """ + + # Basic positivity checks + if any( + v <= 0 + for v in [ + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + ] + ): + logging.debug("All tile parameters must be positive") + return False + + # WarpTile must be divisible by ThreadTile + if warp_tile_m % thread_tile_m != 0: + logging.debug( + f"warp_tile_m ({warp_tile_m}) must be divisible by thread_tile_m ({thread_tile_m})" + ) + return False + if warp_tile_n % thread_tile_n != 0: + logging.debug( + f"warp_tile_n ({warp_tile_n}) must be divisible by thread_tile_n ({thread_tile_n})" + ) + return False + + # WarpTile / ThreadTile product must be multiple of warp size + threads_per_warp = (warp_tile_m * warp_tile_n) // (thread_tile_m * thread_tile_n) + if threads_per_warp % WARP_SIZE != 0: + logging.debug( + f"warp_tile product / thread_tile product ({threads_per_warp}) " + f"must be multiple of WARP_SIZE ({WARP_SIZE})" + ) + return False + + # Calculate WarpSizeScaleFactor + warp_size_scale_factor = threads_per_warp // WARP_SIZE + + if warp_tile_m // thread_tile_m > warp_tile_n // thread_tile_n: + warp_size_scale_factor_m = warp_size_scale_factor + warp_size_scale_factor_n = 1 + else: + warp_size_scale_factor_m = 1 + 
warp_size_scale_factor_n = warp_size_scale_factor + + # Block dimensions must be properly divisible + if (block_m * warp_size_scale_factor_m) % (warp_m * warp_tile_m) != 0: + logging.debug( + f"block_m*scale ({block_m * warp_size_scale_factor_m}) " + f"must be divisible by warp_m*warp_tile_m ({warp_m * warp_tile_m})" + ) + return False + if (block_n * warp_size_scale_factor_n) % (warp_n * warp_tile_n) != 0: + logging.debug( + f"block_n*scale ({block_n * warp_size_scale_factor_n}) " + f"must be divisible by warp_n*warp_tile_n ({warp_n * warp_tile_n})" + ) + return False + + # BlockSize = WARP_SIZE * warp_m * warp_n; should be reasonable + block_size = WARP_SIZE * warp_m * warp_n + if block_size > 1024: + logging.debug(f"BlockSize ({block_size}) exceeds maximum of 1024") + return False + + return True + + +def is_trait_combination_valid(reduce_op, output_index, propagate_nan, pooling_dim): + """ + Validate a pooling trait combination. + + Parameters: + reduce_op: "max" or "avg" + output_index: bool - whether to output indices + propagate_nan: bool - whether to propagate NaN + pooling_dim: "2d" or "3d" + """ + # output_index only makes sense for max pooling + if output_index and reduce_op != "max": + logging.debug("output_index is only supported for max pooling") + return False + + # Pooling dimension must be valid + if pooling_dim not in ("2d", "3d"): + logging.debug(f"Invalid pooling dimension: {pooling_dim}") + return False + + return True From 9bfcce5566dd81b5b54199d3f75d0e6bcac6eeb2 Mon Sep 17 00:00:00 2001 From: Aleksander Dudek Date: Tue, 10 Feb 2026 18:45:06 +0000 Subject: [PATCH 2/2] fix formating --- .../extract_test_params.py | 67 +++++--- .../test_pooling_simple.cpp | 145 +++++++++--------- tile_engine/ops/pooling/pooling_benchmark.hpp | 52 +++---- .../ops/pooling/pooling_benchmark_single.cpp | 97 ++++++------ tile_engine/ops/pooling/pooling_common.hpp | 25 ++- .../ops/pooling/pooling_instance_builder.py | 9 +- tile_engine/ops/pooling/pooling_profiler.hpp 
| 86 +++++------ 7 files changed, 246 insertions(+), 235 deletions(-) diff --git a/test/ck_tile/pooling_tile_engine/extract_test_params.py b/test/ck_tile/pooling_tile_engine/extract_test_params.py index dc93e55ee9c..86c809dd365 100644 --- a/test/ck_tile/pooling_tile_engine/extract_test_params.py +++ b/test/ck_tile/pooling_tile_engine/extract_test_params.py @@ -28,20 +28,36 @@ def extract_test_params(config_file, output_file, pooling_dim="2d"): # Default 2D test parameters test_params = [ { - "N": 1, "H": 8, "W": 8, "C": 32, - "Y": 2, "X": 2, - "stride_h": 2, "stride_w": 2, - "dilation_h": 1, "dilation_w": 1, - "pad_h_left": 0, "pad_h_right": 0, - "pad_w_left": 0, "pad_w_right": 0, + "N": 1, + "H": 8, + "W": 8, + "C": 32, + "Y": 2, + "X": 2, + "stride_h": 2, + "stride_w": 2, + "dilation_h": 1, + "dilation_w": 1, + "pad_h_left": 0, + "pad_h_right": 0, + "pad_w_left": 0, + "pad_w_right": 0, }, { - "N": 2, "H": 16, "W": 16, "C": 32, - "Y": 3, "X": 3, - "stride_h": 2, "stride_w": 2, - "dilation_h": 1, "dilation_w": 1, - "pad_h_left": 1, "pad_h_right": 1, - "pad_w_left": 1, "pad_w_right": 1, + "N": 2, + "H": 16, + "W": 16, + "C": 32, + "Y": 3, + "X": 3, + "stride_h": 2, + "stride_w": 2, + "dilation_h": 1, + "dilation_w": 1, + "pad_h_left": 1, + "pad_h_right": 1, + "pad_w_left": 1, + "pad_w_right": 1, }, ] else: # 3d @@ -51,13 +67,26 @@ def extract_test_params(config_file, output_file, pooling_dim="2d"): # Default 3D test parameters test_params = [ { - "N": 1, "D": 4, "H": 4, "W": 4, "C": 32, - "Z": 2, "Y": 2, "X": 2, - "stride_d": 2, "stride_h": 2, "stride_w": 2, - "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, - "pad_d_left": 0, "pad_d_right": 0, - "pad_h_left": 0, "pad_h_right": 0, - "pad_w_left": 0, "pad_w_right": 0, + "N": 1, + "D": 4, + "H": 4, + "W": 4, + "C": 32, + "Z": 2, + "Y": 2, + "X": 2, + "stride_d": 2, + "stride_h": 2, + "stride_w": 2, + "dilation_d": 1, + "dilation_h": 1, + "dilation_w": 1, + "pad_d_left": 0, + "pad_d_right": 0, + "pad_h_left": 0, + 
"pad_h_right": 0, + "pad_w_left": 0, + "pad_w_right": 0, }, ] diff --git a/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp b/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp index 34ad3d79bb9..9ecf192eaa9 100644 --- a/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp +++ b/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp @@ -33,24 +33,24 @@ /// @brief Test parameters for 2D pooling struct PoolTestParams2D { - int N, H, W, C; // Input dimensions (NHWC) - int Y, X; // Window size - int stride_h, stride_w; // Strides - int dilation_h, dilation_w; // Dilations - int pad_h_left, pad_h_right; // Height padding - int pad_w_left, pad_w_right; // Width padding + int N, H, W, C; // Input dimensions (NHWC) + int Y, X; // Window size + int stride_h, stride_w; // Strides + int dilation_h, dilation_w; // Dilations + int pad_h_left, pad_h_right; // Height padding + int pad_w_left, pad_w_right; // Width padding }; /// @brief Test parameters for 3D pooling struct PoolTestParams3D { int N, D, H, W, C; // Input dimensions (NDHWC) - int Z, Y, X; // Window size - int stride_d, stride_h, stride_w; // Strides - int dilation_d, dilation_h, dilation_w; // Dilations - int pad_d_left, pad_d_right; // Depth padding - int pad_h_left, pad_h_right; // Height padding - int pad_w_left, pad_w_right; // Width padding + int Z, Y, X; // Window size + int stride_d, stride_h, stride_w; // Strides + int dilation_d, dilation_h, dilation_w; // Dilations + int pad_d_left, pad_d_right; // Depth padding + int pad_h_left, pad_h_right; // Height padding + int pad_w_left, pad_w_right; // Width padding }; // Include config-specific test parameters (after parameter structs are defined) @@ -67,17 +67,17 @@ class PoolingTileEngineTest2D : public ::testing::TestWithParam{ - d_in.GetDeviceBuffer(), - d_out.GetDeviceBuffer(), - d_out_index.GetDeviceBuffer(), - input_shape, - output_shape, - input_strides, - output_strides, - window_lengths, - window_strides, - window_dilations, - 
input_left_pads, - input_right_pads}; + auto host_args = ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; // Stream config: no timing overhead for fastest execution ck_tile::stream_config stream_config{nullptr, false, 0, 0, 1, false, false, 1}; @@ -175,20 +174,19 @@ TEST_P(PoolingTileEngineTest2D, BasicFunctionality) d_out_index.FromDevice(h_out_index.data()); // Compute reference on host - auto kernel_args_ref = - ck_tile::PoolKernelArgs{ - h_in.data(), - h_out_ref.data(), - h_out_ref_index.data(), - input_shape, - output_shape, - input_strides, - output_strides, - window_lengths, - window_strides, - window_dilations, - input_left_pads, - input_right_pads}; + auto kernel_args_ref = ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; ck_tile::reference_pool2d& param_info) { - return "N" + std::to_string(param_info.param.N) + "_H" + - std::to_string(param_info.param.H) + "_W" + - std::to_string(param_info.param.W) + "_C" + - std::to_string(param_info.param.C) + "_Y" + - std::to_string(param_info.param.Y) + "_X" + - std::to_string(param_info.param.X); - }); +INSTANTIATE_TEST_SUITE_P(PoolingVerification, + PoolingTileEngineTest2D, + ::testing::ValuesIn(CONFIG_TEST_PARAMS), + [](const ::testing::TestParamInfo& param_info) { + return "N" + std::to_string(param_info.param.N) + "_H" + + std::to_string(param_info.param.H) + "_W" + + std::to_string(param_info.param.W) + "_C" + + std::to_string(param_info.param.C) + "_Y" + + std::to_string(param_info.param.Y) + "_X" + + std::to_string(param_info.param.X); + }); diff --git 
a/tile_engine/ops/pooling/pooling_benchmark.hpp b/tile_engine/ops/pooling/pooling_benchmark.hpp index 8376571b4fa..705d725b10e 100644 --- a/tile_engine/ops/pooling/pooling_benchmark.hpp +++ b/tile_engine/ops/pooling/pooling_benchmark.hpp @@ -27,14 +27,14 @@ enum class PoolMetric /// @brief Pooling problem specification for 2D pooling struct PoolProblem2D { - index_t N, H, W, C; // Input dimensions (NHWC) - index_t Y, X; // Window dimensions - index_t stride_h, stride_w; // Window strides - index_t dilation_h, dilation_w; // Window dilations - index_t pad_h_left, pad_h_right; // Height padding - index_t pad_w_left, pad_w_right; // Width padding - std::string datatype; // Data type name - std::string reduce_op; // "max" or "avg" + index_t N, H, W, C; // Input dimensions (NHWC) + index_t Y, X; // Window dimensions + index_t stride_h, stride_w; // Window strides + index_t dilation_h, dilation_w; // Window dilations + index_t pad_h_left, pad_h_right; // Height padding + index_t pad_w_left, pad_w_right; // Width padding + std::string datatype; // Data type name + std::string reduce_op; // "max" or "avg" index_t Ho() const { @@ -54,9 +54,8 @@ struct PoolProblem2D std::string to_string() const { std::ostringstream oss; - oss << "N" << N << "_H" << H << "_W" << W << "_C" << C << "_Y" << Y << "_X" << X - << "_Sh" << stride_h << "_Sw" << stride_w << "_Dh" << dilation_h << "_Dw" - << dilation_w; + oss << "N" << N << "_H" << H << "_W" << W << "_C" << C << "_Y" << Y << "_X" << X << "_Sh" + << stride_h << "_Sw" << stride_w << "_Dh" << dilation_h << "_Dw" << dilation_w; if(pad_h_left > 0 || pad_w_left > 0) oss << "_Ph" << pad_h_left << "_Pw" << pad_w_left; return oss.str(); @@ -66,15 +65,15 @@ struct PoolProblem2D /// @brief Pooling problem specification for 3D pooling struct PoolProblem3D { - index_t N, D, H, W, C; // Input dimensions (NDHWC) - index_t Z, Y, X; // Window dimensions - index_t stride_d, stride_h, stride_w; // Window strides - index_t dilation_d, dilation_h, 
dilation_w; // Window dilations - index_t pad_d_left, pad_d_right; // Depth padding - index_t pad_h_left, pad_h_right; // Height padding - index_t pad_w_left, pad_w_right; // Width padding - std::string datatype; // Data type name - std::string reduce_op; // "max" or "avg" + index_t N, D, H, W, C; // Input dimensions (NDHWC) + index_t Z, Y, X; // Window dimensions + index_t stride_d, stride_h, stride_w; // Window strides + index_t dilation_d, dilation_h, dilation_w; // Window dilations + index_t pad_d_left, pad_d_right; // Depth padding + index_t pad_h_left, pad_h_right; // Height padding + index_t pad_w_left, pad_w_right; // Width padding + std::string datatype; // Data type name + std::string reduce_op; // "max" or "avg" index_t Do() const { @@ -100,8 +99,8 @@ struct PoolProblem3D std::string to_string() const { std::ostringstream oss; - oss << "N" << N << "_D" << D << "_H" << H << "_W" << W << "_C" << C << "_Z" << Z - << "_Y" << Y << "_X" << X; + oss << "N" << N << "_D" << D << "_H" << H << "_W" << W << "_C" << C << "_Z" << Z << "_Y" + << Y << "_X" << X; return oss.str(); } }; @@ -123,11 +122,10 @@ struct PoolPerformanceResult /// @brief Benchmark settings struct PoolBenchmarkSetting { - int warmup = 5; - int repeat = 20; - bool verify = true; - int init_method = - 0; // 0: uniform random, 1: integer sequence, 2: constant, 3: special + int warmup = 5; + int repeat = 20; + bool verify = true; + int init_method = 0; // 0: uniform random, 1: integer sequence, 2: constant, 3: special }; } // namespace ck_tile diff --git a/tile_engine/ops/pooling/pooling_benchmark_single.cpp b/tile_engine/ops/pooling/pooling_benchmark_single.cpp index 5a5cff0ea91..6c0bfead43e 100644 --- a/tile_engine/ops/pooling/pooling_benchmark_single.cpp +++ b/tile_engine/ops/pooling/pooling_benchmark_single.cpp @@ -55,25 +55,25 @@ int benchmark_pooling_single(int argc, char* argv[]) return -1; // Parse problem dimensions - ck_tile::index_t N = arg_parser.get_int("n"); - ck_tile::index_t H = 
arg_parser.get_int("h"); - ck_tile::index_t W = arg_parser.get_int("w"); - ck_tile::index_t C = arg_parser.get_int("c"); - ck_tile::index_t Y = arg_parser.get_int("wy"); - ck_tile::index_t X = arg_parser.get_int("wx"); - ck_tile::index_t Sy = arg_parser.get_int("sy"); - ck_tile::index_t Sx = arg_parser.get_int("sx"); - ck_tile::index_t Dy = arg_parser.get_int("dy"); - ck_tile::index_t Dx = arg_parser.get_int("dx"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); + ck_tile::index_t Y = arg_parser.get_int("wy"); + ck_tile::index_t X = arg_parser.get_int("wx"); + ck_tile::index_t Sy = arg_parser.get_int("sy"); + ck_tile::index_t Sx = arg_parser.get_int("sx"); + ck_tile::index_t Dy = arg_parser.get_int("dy"); + ck_tile::index_t Dx = arg_parser.get_int("dx"); ck_tile::index_t LeftPy = arg_parser.get_int("phy"); ck_tile::index_t RightPy = arg_parser.get_int("phyr"); ck_tile::index_t LeftPx = arg_parser.get_int("pwx"); ck_tile::index_t RightPx = arg_parser.get_int("pwxr"); - bool verify = arg_parser.get_int("verify") != 0; - int warmup = arg_parser.get_int("warmup"); - int repeat = arg_parser.get_int("repeat"); - int log_level = arg_parser.get_int("log"); + bool verify = arg_parser.get_int("verify") != 0; + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int log_level = arg_parser.get_int("log"); // Calculate output dimensions ck_tile::index_t Ys = (Y - 1) * Dy + 1; @@ -106,30 +106,29 @@ int benchmark_pooling_single(int argc, char* argv[]) d_out_index.SetZero(); // Build host args - const auto input_shape = ck_tile::make_tuple(N, H, W, C); - const auto output_shape = ck_tile::make_tuple(N, Ho, Wo, C); - const auto input_strides = ck_tile::make_tuple(H * W * C, W * C, C, 1); - const auto output_strides = ck_tile::make_tuple(Ho * Wo * C, Wo * C, C, 1); + const auto input_shape = 
ck_tile::make_tuple(N, H, W, C); + const auto output_shape = ck_tile::make_tuple(N, Ho, Wo, C); + const auto input_strides = ck_tile::make_tuple(H * W * C, W * C, C, 1); + const auto output_strides = ck_tile::make_tuple(Ho * Wo * C, Wo * C, C, 1); const auto window_lengths = ck_tile::make_tuple(Y, X); const auto window_strides = ck_tile::make_tuple(Sy, Sx); const auto window_dilations = ck_tile::make_tuple(Dy, Dx); const auto input_left_pads = ck_tile::make_tuple(LeftPy, LeftPx); const auto input_right_pads = ck_tile::make_tuple(RightPy, RightPx); - auto host_args = - ck_tile::PoolHostArgs{ - d_in.GetDeviceBuffer(), - d_out.GetDeviceBuffer(), - d_out_index.GetDeviceBuffer(), - input_shape, - output_shape, - input_strides, - output_strides, - window_lengths, - window_strides, - window_dilations, - input_left_pads, - input_right_pads}; + auto host_args = ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; // Stream configuration ck_tile::stream_config stream{nullptr, true, log_level, warmup, repeat}; @@ -160,20 +159,19 @@ int benchmark_pooling_single(int argc, char* argv[]) d_out.FromDevice(h_out.data()); d_out_index.FromDevice(h_out_index.data()); - auto kernel_args = - ck_tile::PoolKernelArgs{ - h_in.data(), - h_out_ref.data(), - h_out_ref_index.data(), - input_shape, - output_shape, - input_strides, - output_strides, - window_lengths, - window_strides, - window_dilations, - input_left_pads, - input_right_pads}; + auto kernel_args = ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; ck_tile::reference_pool2d" + ) window_rank = 3 pragma_line = "#pragma 
once\n" if is_header else "" diff --git a/tile_engine/ops/pooling/pooling_profiler.hpp b/tile_engine/ops/pooling/pooling_profiler.hpp index d98fd1f59c2..37eb8ab2eb8 100644 --- a/tile_engine/ops/pooling/pooling_profiler.hpp +++ b/tile_engine/ops/pooling/pooling_profiler.hpp @@ -19,7 +19,10 @@ namespace ck_tile { /// /// Handles tensor setup, kernel launch, reference computation, and verification /// for 2D pooling benchmarks. -template +template class PoolProfiler2D { public: @@ -56,34 +59,30 @@ class PoolProfiler2D d_out_index.SetZero(); // Build kernel args - const auto input_shape = - make_tuple(problem.N, problem.H, problem.W, problem.C); + const auto input_shape = make_tuple(problem.N, problem.H, problem.W, problem.C); const auto output_shape = make_tuple(problem.N, Ho, Wo, problem.C); - const auto input_strides = make_tuple( - problem.H * problem.W * problem.C, problem.W * problem.C, problem.C, 1); - const auto output_strides = - make_tuple(Ho * Wo * problem.C, Wo * problem.C, problem.C, 1); - const auto window_lengths = make_tuple(problem.Y, problem.X); - const auto window_strides = make_tuple(problem.stride_h, problem.stride_w); + const auto input_strides = + make_tuple(problem.H * problem.W * problem.C, problem.W * problem.C, problem.C, 1); + const auto output_strides = make_tuple(Ho * Wo * problem.C, Wo * problem.C, problem.C, 1); + const auto window_lengths = make_tuple(problem.Y, problem.X); + const auto window_strides = make_tuple(problem.stride_h, problem.stride_w); const auto window_dilations = make_tuple(problem.dilation_h, problem.dilation_w); - const auto input_left_pads = make_tuple(problem.pad_h_left, problem.pad_w_left); - const auto input_right_pads = - make_tuple(problem.pad_h_right, problem.pad_w_right); - - auto host_args = - PoolHostArgs{ - d_in.GetDeviceBuffer(), - d_out.GetDeviceBuffer(), - d_out_index.GetDeviceBuffer(), - input_shape, - output_shape, - input_strides, - output_strides, - window_lengths, - window_strides, - 
window_dilations, - input_left_pads, - input_right_pads}; + const auto input_left_pads = make_tuple(problem.pad_h_left, problem.pad_w_left); + const auto input_right_pads = make_tuple(problem.pad_h_right, problem.pad_w_right); + + auto host_args = PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; // Launch kernel float latency = kernel_func(host_args); @@ -95,20 +94,19 @@ class PoolProfiler2D // Verify if requested if(setting_.verify) { - auto kernel_args_ref = - PoolKernelArgs{ - h_in.data(), - h_out_ref.data(), - h_out_ref_index.data(), - input_shape, - output_shape, - input_strides, - output_strides, - window_lengths, - window_strides, - window_dilations, - input_left_pads, - input_right_pads}; + auto kernel_args_ref = PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; // Use ReduceOp::Max as default for reference using ReduceOp = ReduceOp::Max; @@ -119,11 +117,9 @@ class PoolProfiler2D ReduceOp, decltype(input_shape), decltype(window_lengths), - true>( - h_in, h_out_ref, h_out_ref_index, kernel_args_ref, ReduceOp{}); + true>(h_in, h_out_ref, h_out_ref_index, kernel_args_ref, ReduceOp{}); - bool pass = - check_err(h_out, h_out_ref, "Error: Incorrect results!", 1e-3, 1e-3); + bool pass = check_err(h_out, h_out_ref, "Error: Incorrect results!", 1e-3, 1e-3); if(!pass) { std::cerr << "Verification FAILED!" << std::endl;