diff --git a/gmm2.py b/gmm2.py
new file mode 100644
index 000000000..938d69247
--- /dev/null
+++ b/gmm2.py
@@ -0,0 +1,90 @@
+import os
+import time
+import torch
+import transformer_engine.pytorch as te
+
+torch.manual_seed(0)
+
+os.environ["NVTE_USE_CUTLASS_GROUPED_GEMM"] = "1"
+os.environ["NVTE_CUTLASS_GROUPED_GEMM_WARN_FALLBACK"] = "1"
+
+device = "cuda"
+dtype = torch.bfloat16
+
+E = 4
+K = 1024
+N = 2048
+m_splits = [128, 64, 0, 256]
+M_total = sum(m_splits)
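+# Note: one split is 0, so the empty-group path of the grouped GEMM is
+# exercised by this benchmark as well.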
+
+x = torch.randn(M_total, K, device=device, dtype=dtype)
+
+# Timing helper
+def bench_cuda(fn, warmup=20, iters=100):
+    # Warmup
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+
+    # Timed
+    start = time.time()
+    for _ in range(iters):
+        fn()
+    torch.cuda.synchronize()
+    end = time.time()
+
+    avg_ms = (end - start) * 1000.0 / iters
+    return avg_ms
+
+# TE GroupedLinear
+glinear = te.GroupedLinear(E, K, N, bias=False).to(device=device, dtype=dtype)
+
+def te_run():
+    return glinear(x, m_splits=m_splits)
+
+te_ms = bench_cuda(te_run, warmup=20, iters=100)
+
+# Grab weights for reference path
+Ws = [getattr(glinear, f"weight{e}") for e in range(E)]  # each [N, K]
+W = torch.stack(Ws, dim=0)  # [E, N, K]
+assert W.shape == (E, N, K), f"Unexpected weight shape: {W.shape}"
+
+# Torch reference (group loop)
+offsets = []
+off = 0
+for m in m_splits:
+    offsets.append(off)
+    off += m
+
+y_ref_buf = torch.empty((M_total, N), device=device, dtype=dtype)
+
+def torch_run():
+    # Fill the preallocated buffer
+    for e, m in enumerate(m_splits):
+        if m == 0:
+            continue
+        o = offsets[e]
+        y_ref_buf[o:o+m].copy_(x[o:o+m] @ W[e].transpose(0, 1))
+    return y_ref_buf
+
+torch_ms = bench_cuda(torch_run, warmup=20, iters=100)
+
+# Compare outputs
+y_te = te_run()
+y_ref = torch_run().clone()
+
+diff = (y_te.float() - y_ref.float())
+max_abs = diff.abs().max().item()
+rel = (diff.abs() / (y_ref.float().abs() + 1e-6)).max().item()
+
+print(f"Errors:")
+print(f"  {y_te.shape=}, {y_ref.shape=}")
+print("  max_abs_err:", max_abs)
+print("  max_rel_err:", rel)
+
+torch.testing.assert_close(y_te.float(), y_ref.float(), rtol=3e-2, atol=3e-2)
+
+print(f"\nTiming:")
+print(f"  TE avg:    {te_ms:.3f} ms")
+print(f"  Torch avg: {torch_ms:.3f} ms")
+print(f"  Speedup:   {torch_ms/te_ms:.2f}x (Torch / TE)")
diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index bc29d29e3..bd71e0a50 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -28,7 +28,7 @@
     is_bf16_compatible,
 )
 if IS_HIP_EXTENSION:
-    from transformer_engine.pytorch.utils import is_mi200, is_mi308
+    from transformer_engine.pytorch.utils import is_mi200, is_mi308, is_mi300_class
 
 from transformer_engine.pytorch import (
     DotProductAttention,
@@ -148,7 +148,7 @@ def rocm_attn_backend() -> tuple[bool, bool, bool]:
 
 use_cutlass_grouped_gemm = [False]
 # Only enable cutlass grouped gemm on Hopper
-if torch.cuda.get_device_capability() == (9, 0):
+if torch.cuda.get_device_capability() == (9, 0) or IS_HIP_EXTENSION:
     use_cutlass_grouped_gemm.append(True)
 
 
@@ -1386,7 +1386,7 @@ def test_linear_accuracy_delay_wgrad_compute(dtype, bs, model, bias, fuse_wgrad_
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     te_linear_ref = Linear(
         config.hidden_size,
@@ -1678,7 +1678,7 @@ def test_layernorm_linear_accuracy_delay_wgrad_compute(
 ):
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     config = model_configs[model]
     ln_linear_ref = LayerNormLinear(
@@ -1892,7 +1892,7 @@ def test_layernorm_mlp_accuracy_delay_wgrad_compute(
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     ln_mlp = LayerNormMLP(
         hidden_size=config.hidden_size,
@@ -2042,7 +2042,7 @@ def test_grouped_linear_accuracy(
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and not fp8:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     if fp8 and fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
         pytest.skip("FP8 parameters are not supported in debug mode.")
@@ -2121,6 +2121,8 @@ def test_grouped_linear_accuracy(
     atol, rtol = 0, 0
     if use_cutlass:
         atol, rtol = 1e-3, 1e-3
+        if IS_HIP_EXTENSION and is_mi300_class():
+            atol, rtol = 3e-2, 3e-2
     if use_triton:
         atol, rtol = get_tolerances(dtype)
         if dtype == torch.float32:
@@ -2131,7 +2133,7 @@ def test_grouped_linear_accuracy(
 
 
 @pytest.mark.skipif(
-    torch.cuda.get_device_capability() != (9, 0),
+    torch.cuda.get_device_capability() != (9, 0) and not IS_HIP_EXTENSION,
     reason="Only enable CUTLASS grouped gemm on Hopper",
 )
 @pytest.mark.parametrize("dtype", param_types, ids=str)
@@ -2936,7 +2938,10 @@ def test_grouped_gemm(shape, dtype, layout, accumulate, use_cutlass):
         # cublas implementation should be bit-wise match
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
     else:
-        torch.testing.assert_close(o, o_ref, rtol=1.5e-2, atol=1.5e-2)
+        if IS_HIP_EXTENSION and is_mi300_class():
+            torch.testing.assert_close(o, o_ref, rtol=2.0e-2, atol=3.0e-2)
+        else:
+            torch.testing.assert_close(o, o_ref, rtol=1.5e-2, atol=1.5e-2)
 
     if use_cutlass:
         os.environ.pop("NVTE_USE_CUTLASS_GROUPED_GEMM", None)
diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt
index cefec6d06..4a04a630f 100644
--- a/transformer_engine/common/CMakeLists.txt
+++ b/transformer_engine/common/CMakeLists.txt
@@ -203,6 +203,7 @@ else()
             fused_attn_rocm/fused_attn_ck.cpp
             fused_attn_rocm/utils.cpp
             gemm/rocm_gemm.cu
+            gemm/ck_grouped_gemm.cpp
             amd_detail/system.cpp)
 
     # process source code files
@@ -251,6 +252,9 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
   else()
     message(FATAL_ERROR "cutlass gemm/cutlass_grouped_gemm.cu kernel required sm 90a")
   endif()
+else()
+  set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/aiter/3rdparty/composable_kernel)
+  target_include_directories(transformer_engine PRIVATE ${CK_ROOT}/include)
 endif() #USE_CUDA
 
 # Configure dependencies
diff --git a/transformer_engine/common/gemm/ck_grouped_gemm.cpp b/transformer_engine/common/gemm/ck_grouped_gemm.cpp
new file mode 100644
index 000000000..4c3e0264a
--- /dev/null
+++ b/transformer_engine/common/gemm/ck_grouped_gemm.cpp
@@ -0,0 +1,273 @@
+/* Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. */
+
+#include <transformer_engine/transformer_engine.h>
+
+#include <vector>
+#include "../common.h"
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+using RowMajor = ck_tile::tensor_layout::gemm::RowMajor;
+using ColMajor = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+template <typename TeType> struct TeTypeToCkType;
+template <> struct TeTypeToCkType<transformer_engine::fp16> { using type = ck_tile::half_t; };
+template <> struct TeTypeToCkType<transformer_engine::bf16> { using type = ck_tile::bfloat16_t; };
+
+// Treat TE tensors as generalized 2D matrices by flattening:
+// (D1, D2, ..., Dn) -> (D1*...*D(n-1), Dn), consistent with TE Tensor::flat_*_dim.
+static inline bool get_flat_2d_dims(const transformer_engine::Tensor& t,
+                                    int64_t& d0, int64_t& d1) {
+  // Require at least a matrix (rank >= 2). Higher ranks are flattened.
+  if (t.shape().size() < 2)
+    return false;
+  d0 = static_cast<int64_t>(t.flat_first_dim());
+  d1 = static_cast<int64_t>(t.flat_last_dim());
+  return true;
+}
+
+static inline const transformer_engine::SimpleTensor& data_view(const transformer_engine::Tensor& t) {
+  return t.data;  // rowwise data view
+}
+
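+// Tile configuration used for every group: a 256x128x64 block tile, a 2x2 warp
+// grid with 32x32x16 per-warp GEMM tiles, and padding enabled on M/N/K so
+// arbitrary (non-multiple) per-group shapes are accepted.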
+struct TileCfg_basic {
+  static constexpr ck_tile::index_t M_Tile = 256;
+  static constexpr ck_tile::index_t N_Tile = 128;
+  static constexpr ck_tile::index_t K_Tile = 64;
+
+  static constexpr ck_tile::index_t M_Warp = 2;
+  static constexpr ck_tile::index_t N_Warp = 2;
+  static constexpr ck_tile::index_t K_Warp = 1;
+
+  static constexpr ck_tile::index_t M_Warp_Tile = 32;
+  static constexpr ck_tile::index_t N_Warp_Tile = 32;
+  static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+  static constexpr bool kPadM = true;
+  static constexpr bool kPadN = true;
+  static constexpr bool kPadK = true;
+
+  static constexpr bool DoubleSmemBuffer = false;
+
+  static constexpr ck_tile::index_t TilePartitionerGroupNum = 8;
+  static constexpr ck_tile::index_t TilePartitionerM01 = 1;
+};
+
+// This class instantiates CK_Tile's grouped GEMM pipeline.
+// See e.g. https://github.com/ROCm/composable_kernel/blob/develop/example/ck_tile/03_gemm/universal_gemm_invoker.hpp for reference.
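+// The kernel is assembled from a spatially-local tile partitioner (work
+// distribution over the tiles of all groups), the AgBgCr compute-v3 pipeline,
+// and a CShuffle epilogue; MemOp selects the epilogue's memory operation,
+// i.e. whether results are written or accumulated into D.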
+template <typename AType, typename BType, typename AccType, typename CType, typename ALayout, typename BLayout, typename CLayout, ck_tile::memory_operation_enum MemOp, typename TileCfg = TileCfg_basic>
+struct Runner {
+  using GemmShape = ck_tile::TileGemmShape<
+      ck_tile::sequence<TileCfg::M_Tile, TileCfg::N_Tile, TileCfg::K_Tile>,
+      ck_tile::sequence<TileCfg::M_Warp, TileCfg::N_Warp, TileCfg::K_Warp>,
+      ck_tile::sequence<TileCfg::M_Warp_Tile, TileCfg::N_Warp_Tile, TileCfg::K_Warp_Tile>>;
+
+  using Partitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+      GemmShape, TileCfg::TilePartitionerGroupNum, TileCfg::TilePartitionerM01>;
+
+  using UniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<
+      TileCfg::kPadM, TileCfg::kPadN, TileCfg::kPadK,
+      TileCfg::DoubleSmemBuffer, ALayout, BLayout, CLayout>;
+
+  static constexpr ck_tile::GemmPipelineScheduler Scheduler =
+      ck_tile::GemmPipelineScheduler::Intrawave;
+
+  using Problem = ck_tile::UniversalGemmPipelineProblem<
+      AType, BType, AccType, GemmShape, UniversalTraits, Scheduler>;
+
+  using Pipeline = ck_tile::GemmPipelineAgBgCrCompV3<Problem>;
+
+  using Epilogue = ck_tile::CShuffleEpilogue<
+      ck_tile::CShuffleEpilogueProblem<
+          AType, BType, ck_tile::tuple<>, AccType,
+          CType, ck_tile::tuple<>, CLayout,
+          ck_tile::element_wise::PassThrough,
+          Partitioner::MPerBlock, Partitioner::NPerBlock,
+          TileCfg::M_Warp, TileCfg::N_Warp,
+          TileCfg::M_Warp_Tile, TileCfg::N_Warp_Tile, TileCfg::K_Warp_Tile,
+          Problem::TransposeC, MemOp>>;
+
+  using Kernel = ck_tile::GroupedGemmKernel<Partitioner, Pipeline, Epilogue>;
+};
+
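+// Builds one host-side descriptor per group from the flattened 2D views,
+// copies the kernel arguments into the caller-provided device workspace, and
+// issues a single grouped-kernel launch on the given stream.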
+template <typename T, typename ALayout, typename BLayout, ck_tile::memory_operation_enum MemOp>
+static bool run_grouped_impl(const transformer_engine::Tensor* const* A_use,
+                             const transformer_engine::Tensor* const* B_use,
+                             transformer_engine::Tensor* const* D,
+                             int group_num,
+                             bool transA_use,
+                             bool transB_use,
+                             void* workspace,
+                             size_t workspace_bytes,
+                             hipStream_t stream)
+{
+  using Kernel = typename Runner<T, T, float, T, ALayout, BLayout, RowMajor, MemOp>::Kernel;
+
+  const size_t needed = Kernel::GetWorkSpaceSize(group_num);
+  if (!workspace || workspace_bytes < needed) {
+    NVTE_ERROR("ck_tile_grouped_gemm: insufficient workspace. Needed bytes=", needed);
+    return false;
+  }
+
+  std::vector<ck_tile::GroupedGemmHostArgs<>> descs;
+  descs.reserve(group_num);
+
+  for (int i = 0; i < group_num; ++i) {
+    const auto& a = data_view(*A_use[i]);
+    const auto& b = data_view(*B_use[i]);
+    const auto& d = data_view(*D[i]);
+
+    int64_t Ad0 = 0, Ad1 = 0, Bd0 = 0, Bd1 = 0, Dd0 = 0, Dd1 = 0;
+    if (!get_flat_2d_dims(*A_use[i], Ad0, Ad1) ||
+        !get_flat_2d_dims(*B_use[i], Bd0, Bd1) ||
+        !get_flat_2d_dims(*D[i], Dd0, Dd1)) {
+      NVTE_ERROR("ck_tile_grouped_gemm: expected all groups to be rank>=2 (2D or higher).");
+      return false;
+    }
+
+    const int64_t M = transA_use ? Ad1 : Ad0;
+    const int64_t K = transA_use ? Ad0 : Ad1;
+    const int64_t N = transB_use ? Bd0 : Bd1;
+    const int64_t Kb = transB_use ? Bd1 : Bd0;
+
+    if (Kb != K) {
+      NVTE_ERROR("ck_tile_grouped_gemm: K mismatch between A and B in group ", i);
+      return false;
+    }
+
+    if (Dd0 != M || Dd1 != N) {
+      NVTE_ERROR("ck_tile_grouped_gemm: D shape mismatch in group ", i);
+      return false;
+    }
+
+    // Leading dimensions under the flattened-contiguous interpretation
+    const ck_tile::index_t stride_A = Ad1;
+    const ck_tile::index_t stride_B = Bd1;
+    const ck_tile::index_t stride_E = Dd1;
+
+    descs.emplace_back(
+        a.dptr,
+        b.dptr,
+        std::array<const void*, 0>{},
+        d.dptr,
+        1,
+        M,
+        N,
+        K,
+        stride_A,
+        stride_B,
+        std::array<ck_tile::index_t, 0>{},
+        stride_E);
+  }
+
+  const dim3 grids = Kernel::GridSize(descs);
+  auto kargs = Kernel::MakeKargs(descs);
+  if (!Kernel::IsSupportedArgument(kargs)) {
+    NVTE_ERROR("ck_tile_grouped_gemm: CK_Tile kernel arguments not supported for this config.");
+    return false;
+  }
+
+  HIP_CHECK_ERROR(hipMemcpyAsync(workspace,
+                                 kargs.data(),
+                                 kargs.size() * sizeof(typename decltype(kargs)::value_type),
+                                 hipMemcpyHostToDevice,
+                                 stream));
+
+  const ck_tile::stream_config s{stream};
+  const dim3 blocks = Kernel::BlockSize();
+
+  ck_tile::launch_kernel(
+      s,
+      ck_tile::make_kernel<1>(
+          Kernel{}, grids, blocks, 0,
+          ck_tile::cast_pointer_to_constant_address_space(workspace),
+          group_num));
+  return true;
+}
+
+template <typename T, ck_tile::memory_operation_enum MemOp>
+static inline bool dispatch_grouped(bool transA_use,
+                                    bool transB_use,
+                                    const transformer_engine::Tensor* const* A_use,
+                                    const transformer_engine::Tensor* const* B_use,
+                                    transformer_engine::Tensor* const* D,
+                                    int group_num,
+                                    void* workspace,
+                                    size_t workspace_bytes,
+                                    hipStream_t stream) {
+
+// FIXME: This could be a templated lambda function in C++20.
+#define CALL(ALayout_, BLayout_, ta_, tb_)                                          \
+  return run_grouped_impl<T, ALayout_, BLayout_, MemOp>(                            \
+      A_use, B_use, D, group_num, (ta_), (tb_), workspace, workspace_bytes, stream)
+
+  if (!transA_use && !transB_use) { CALL(RowMajor, RowMajor, false, false); }
+  if (!transA_use &&  transB_use) { CALL(RowMajor, ColMajor, false, true ); }
+  if ( transA_use && !transB_use) { CALL(ColMajor, RowMajor, true,  false); }
+  /* transA_use && transB_use */  { CALL(ColMajor, ColMajor, true,  true ); }
+
+#undef CALL
+}
+
+bool ck_tile_grouped_gemm(const NVTETensor* A,
+                          const NVTETensor* B,
+                          NVTETensor* D,
+                          int group_num,
+                          bool transA,
+                          bool transB,
+                          NVTETensor* workspace,
+                          bool accumulate,
+                          hipStream_t stream)
+{
+  if (group_num <= 0)
+    return true;
+
+  // Convert A/B/D arrays into TE Tensor arrays
+  std::vector<transformer_engine::Tensor*> A_te(group_num);
+  std::vector<transformer_engine::Tensor*> B_te(group_num);
+  std::vector<transformer_engine::Tensor*> D_te(group_num);
+
+  for (int i = 0; i < group_num; ++i) {
+    A_te[i] = transformer_engine::convertNVTETensorCheck(A[i]);
+    B_te[i] = transformer_engine::convertNVTETensorCheck(B[i]);
+    D_te[i] = transformer_engine::convertNVTETensorCheck(D[i]);
+  }
+
+  // Workspace pointer + bytes
+  void* ws_ptr = nullptr;
+  size_t ws_bytes = 0;
+  if (workspace) {
+    auto* ws_te = transformer_engine::convertNVTETensorCheck(*workspace);
+    ws_ptr = ws_te->data.dptr;
+    ws_bytes = ws_te->data.numel() *
+               transformer_engine::typeToSize(ws_te->data.dtype);
+  }
+
+  // Normalize similar to upstream
+  // See https://github.com/NVIDIA/TransformerEngine/blob/59f6f3876767d07045152bfae07b5dd4c54e1725/transformer_engine/common/gemm/cutlass_grouped_gemm.cu#L54-L68
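+  // The swap relies on (op(A) * op(B))^T = op(B)^T * op(A)^T: TE/cuBLAS describe
+  // the product in column-major terms, while the CK_Tile kernel works on
+  // row-major views of the same buffers, i.e. on the transposed matrices.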
+  const transformer_engine::Tensor* const* A_use = B_te.data();
+  const transformer_engine::Tensor* const* B_use = A_te.data();
+  const bool transA_use = transB;
+  const bool transB_use = transA;
+
+  const auto a_dtype = A_use[0]->dtype();
+
+  TRANSFORMER_ENGINE_TYPE_SWITCH_16BIT(a_dtype, te_type, {
+    using T = typename TeTypeToCkType<te_type>::type;
+
+    if (accumulate)
+      return dispatch_grouped<T, ck_tile::memory_operation_enum::atomic_add>(
+          transA_use, transB_use, A_use, B_use, D_te.data(), group_num,
+          ws_ptr, ws_bytes, stream);
+    else
+      return dispatch_grouped<T, ck_tile::memory_operation_enum::set>(
+          transA_use, transB_use, A_use, B_use, D_te.data(), group_num,
+          ws_ptr, ws_bytes, stream);
+  });
+}
diff --git a/transformer_engine/common/gemm/ck_grouped_gemm.h b/transformer_engine/common/gemm/ck_grouped_gemm.h
new file mode 100644
index 000000000..d539b47f7
--- /dev/null
+++ b/transformer_engine/common/gemm/ck_grouped_gemm.h
@@ -0,0 +1,11 @@
+/* Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. */
+
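+// Grouped GEMM over group_num pairs (A[i], B[i]) -> D[i] using CK_Tile.
+// workspace must provide device memory for the kernel-argument block; only
+// fp16/bf16 operands are supported, and accumulate=true adds the result into D
+// instead of overwriting it.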
+bool ck_tile_grouped_gemm(const NVTETensor* A,
+                          const NVTETensor* B,
+                          NVTETensor* D,
+                          int group_num,
+                          bool transA,
+                          bool transB,
+                          NVTETensor* workspace,
+                          bool accumulate,
+                          hipStream_t stream);
diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu
index 9c2ca9b4c..bb322233b 100644
--- a/transformer_engine/common/gemm/cublaslt_gemm.cu
+++ b/transformer_engine/common/gemm/cublaslt_gemm.cu
@@ -1,6 +1,6 @@
 /*************************************************************************
  * This file was modified for portability to AMDGPU
- * Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE for license information.
@@ -24,8 +24,11 @@
 #include "../util/logging.h"
 #include "../util/multi_stream.h"
 #include "common/util/cuda_runtime.h"
+#include "common/util/system.h"
 #ifndef __HIP_PLATFORM_AMD__
 #include "cutlass_grouped_gemm.cuh"
+#else
+#include "ck_grouped_gemm.h"
 #endif
 
 #ifndef __HIP_PLATFORM_AMD__
@@ -788,9 +791,10 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
   NVTE_API_CALL(nvte_multi_tensor_gemm);
 
 #ifdef __HIP_PLATFORM_AMD__
-  multi_stream_cublas_gemm(A, B, D, bias, pre_gelu_out, num_gemms, transa, transb, grad,
-                           workspace, accumulate, use_split_accumulator, math_sm_count, stream);
-#else
+  if (num_gemms <= 0)
+    return;
+#endif
+
   const int current_device = transformer_engine::cuda::current_device();
   const bool is_hopper = (transformer_engine::cuda::sm_arch(current_device) == 90);
   const bool use_cutlass = transformer_engine::getenv<bool>("NVTE_USE_CUTLASS_GROUPED_GEMM", false);
@@ -803,7 +807,11 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
   };
 
   // Currently only support cutlass group gemm on Hopper Arch
+#ifdef __HIP_PLATFORM_AMD__
+  if (!use_cutlass || num_gemms == 1) {
+#else
   if (!(is_hopper && use_cutlass)) {
+#endif
     cublas_path();
     return;
   }
@@ -837,12 +845,21 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
     auto *inputA = transformer_engine::convertNVTETensorCheck(A[0]);
     auto *inputB = transformer_engine::convertNVTETensorCheck(B[0]);
     auto *OutputD = transformer_engine::convertNVTETensorCheck(D[0]);
+#ifdef __HIP_PLATFORM_AMD__
+    auto A_dt = inputA->data.dtype;
+    auto B_dt = inputB->data.dtype;
+    auto D_dt = OutputD->data.dtype;
+    return (A_dt == B_dt) && (A_dt == D_dt) &&
+           (A_dt == transformer_engine::DType::kFloat16 ||
+            A_dt == transformer_engine::DType::kBFloat16);
+#else
     auto A_type = get_cuda_dtype(inputA->data.dtype);
    auto B_type = get_cuda_dtype(inputB->data.dtype);
    auto D_type = get_cuda_dtype(OutputD->data.dtype);

    return (A_type == B_type) && (A_type == D_type) &&
           ((A_type == CUDA_R_16BF) || (A_type == CUDA_R_16F));
+#endif
   };
 
   // CUTLASS Grouped GEMM fast path (SM90/TMA)
@@ -855,14 +872,18 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
   //
   // Otherwise, fall back to cuBLAS.
   if (is_empty_arr(bias) && is_empty_arr(pre_gelu_out) && is_supported_dtype() &&
+#ifdef __HIP_PLATFORM_AMD__
+      true) {
+    ck_tile_grouped_gemm(A, B, D, num_gemms, transa, transb, workspace, accumulate, stream);
+#else
       all_groups_uniform_k128(B, transb)) {
     cutlass_grouped_gemm(A, B, D, num_gemms, transa, transb, grad, workspace, accumulate,
                          current_device, math_sm_count, stream);
+#endif
   } else {
     if (warn_fallback) {
       NVTE_WARN("Fallback to cuBLAS grouped GEMM.");
     }
     cublas_path();
   }
-#endif  // __HIP_PLATFORM_AMD__
 }
diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py
index d124fbeaf..92253ad9c 100644
--- a/transformer_engine/pytorch/utils.py
+++ b/transformer_engine/pytorch/utils.py
@@ -1,5 +1,5 @@
 # This file was modified for portability to AMDGPU
-# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -456,6 +456,14 @@ def is_mi308():
         import re
         return (re.search('AMD Instinct MI308', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
 
+    def is_mi300_class():
+        """Check whether the current device is of the gfx942 class."""
+        return get_device_compute_capability() == (9, 4)
+
+    def is_mi350_class():
+        """Check whether the current device is of the gfx950 class."""
+        return get_device_compute_capability() == (9, 5)
+
     @functools.lru_cache(maxsize=None)
     def is_fp8_fnuz():
         return IS_HIP_EXTENSION and get_device_compute_capability() == (9, 4)