diff --git a/kernel_perf_agent/__init__.py b/kernel_perf_agent/__init__.py index 1f49766..47bb96e 100644 --- a/kernel_perf_agent/__init__.py +++ b/kernel_perf_agent/__init__.py @@ -14,5 +14,4 @@ """Kernel Performance Agent package.""" -# "Kernel Performance Agent package __all__ = [] diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py new file mode 100644 index 0000000..d8db477 --- /dev/null +++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Diagnose Prompt Module for Hardware Bottleneck Analysis. + +""" + +__all__: list[str] = [] diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py new file mode 100644 index 0000000..48351a6 --- /dev/null +++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +GPU Specifications Database for Bottleneck Analysis + +This module provides GPU hardware specifications needed for performance analysis +and bottleneck identification. It includes peak compute performance, memory bandwidth, +cache sizes, and SM counts for common NVIDIA GPUs. + +""" + +import logging +from typing import Any + +from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import ( + GPU_SPECS_DATABASE, +) + +__all__ = ["GPU_SPECS_DATABASE", "get_gpu_specs"] + +logger = logging.getLogger(__name__) + + +def get_gpu_specs(gpu_name: str) -> dict[str, Any] | None: + """ + Get GPU specifications for bottleneck analysis. + + This function returns hardware specifications needed for performance analysis, + including peak compute performance, memory bandwidth, cache sizes, and SM counts. + + Args: + gpu_name: GPU name. Must exactly match a key in GPU_SPECS_DATABASE. + + Returns: + Dictionary with GPU specifications, or None if GPU is not in the database. 
+ When successful, contains: + - name: GPU name + - architecture: GPU architecture (e.g., "Ampere", "Hopper") + - peak_fp32_tflops: Peak FP32 compute performance in TFLOPS + - peak_fp16_tflops: Peak FP16 compute performance in TFLOPS + - peak_bf16_tflops: Peak BF16 compute performance in TFLOPS (0 if not supported) + - peak_memory_bw_gbps: Peak memory bandwidth in GB/s + - sm_count: Number of streaming multiprocessors + - max_threads_per_sm: Maximum threads per SM + - l1_cache_kb: L1 cache size in KB per SM + - l2_cache_mb: Total L2 cache size in MB + - memory_gb: Total GPU memory in GB + - memory_type: Memory type (e.g., "HBM2e", "GDDR6X") + + Examples: + >>> specs = get_gpu_specs("NVIDIA A100") + >>> if specs: + ... print(f"SM Count: {specs['sm_count']}") + """ + if gpu_name in GPU_SPECS_DATABASE: + return GPU_SPECS_DATABASE[gpu_name].copy() + + logger.warning( + "Unknown GPU: '%s'. Disable Optimization. Available GPUs: %s", + gpu_name, + ", ".join(GPU_SPECS_DATABASE.keys()), + ) + return None + + +if __name__ == "__main__": + print("GPU Specifications Module") + print("=" * 60) + + # Show all available GPUs + print("Available GPU specifications in database:") + for gpu_name in sorted(GPU_SPECS_DATABASE.keys()): + print(f" - {gpu_name}") + + # Example usage + print(f"\n{'=' * 60}") + example_gpu = "NVIDIA A100" + specs = get_gpu_specs(example_gpu) + if specs: + print(f"\nExample specs for {example_gpu}:") + print(f" - Peak Memory Bandwidth: {specs['peak_memory_bw_gbps']} GB/s") + print(f" - Peak FP32 Performance: {specs['peak_fp32_tflops']} TFLOPS") + print(f" - SM Count: {specs['sm_count']}") diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py new file mode 100644 index 0000000..cbc616d --- /dev/null +++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +GPU Specifications Database - Updated with Specific SKUs + +This module contains the GPU hardware specifications database used for +performance analysis and bottleneck identification. Updated to include +specific SKU variants for multi-SKU GPUs like A100 and H100. 
+ +Sources: +- NVIDIA official specifications and datasheets +- TechPowerUp GPU Database +- Manufacturer datasheets + +Last Updated: January 2026 +""" + +GPU_SPECS_DATABASE: dict[str, dict[str, object]] = { + # NVIDIA A100 SKUs - SXM4 Variants + "NVIDIA A100 SXM4 40GB": { + "name": "NVIDIA A100 SXM4 40GB", + "architecture": "Ampere", + "peak_fp32_tflops": 19.5, + "peak_fp16_tflops": 312.0, # Without sparsity + "peak_bf16_tflops": 312.0, # Without sparsity + "peak_memory_bw_gbps": 1555, + "sm_count": 108, + "max_threads_per_sm": 2048, + "l1_cache_kb": 192, + "l2_cache_mb": 40, + "memory_gb": 40, + "memory_type": "HBM2e", + "form_factor": "SXM4", + "tdp_w": 400, + }, + "NVIDIA A100 SXM4 80GB": { + "name": "NVIDIA A100 SXM4 80GB", + "architecture": "Ampere", + "peak_fp32_tflops": 19.5, + "peak_fp16_tflops": 312.0, # Without sparsity + "peak_bf16_tflops": 312.0, # Without sparsity + "peak_memory_bw_gbps": 2039, + "sm_count": 108, + "max_threads_per_sm": 2048, + "l1_cache_kb": 192, + "l2_cache_mb": 40, + "memory_gb": 80, + "memory_type": "HBM2e", + "form_factor": "SXM4", + "tdp_w": 400, + }, + # NVIDIA A100 SKUs - PCIe Variants + "NVIDIA A100 PCIe 40GB": { + "name": "NVIDIA A100 PCIe 40GB", + "architecture": "Ampere", + "peak_fp32_tflops": 19.5, + "peak_fp16_tflops": 312.0, # Without sparsity + "peak_bf16_tflops": 312.0, # Without sparsity + "peak_memory_bw_gbps": 1555, + "sm_count": 108, + "max_threads_per_sm": 2048, + "l1_cache_kb": 192, + "l2_cache_mb": 40, + "memory_gb": 40, + "memory_type": "HBM2e", + "form_factor": "PCIe", + "tdp_w": 250, + }, + "NVIDIA A100 PCIe 80GB": { + "name": "NVIDIA A100 PCIe 80GB", + "architecture": "Ampere", + "peak_fp32_tflops": 19.5, + "peak_fp16_tflops": 312.0, # Without sparsity + "peak_bf16_tflops": 312.0, # Without sparsity + "peak_memory_bw_gbps": 1935, + "sm_count": 108, + "max_threads_per_sm": 2048, + "l1_cache_kb": 192, + "l2_cache_mb": 40, + "memory_gb": 80, + "memory_type": "HBM2e", + "form_factor": "PCIe", + "tdp_w": 300, + }, + # NVIDIA H100 SKUs - SXM5 Variant + "NVIDIA H100 SXM5 80GB": { + "name": "NVIDIA H100 SXM5 80GB", + "architecture": "Hopper", + "peak_fp32_tflops": 67.0, + "peak_fp16_tflops": 1979.0, # Without sparsity + "peak_bf16_tflops": 1979.0, # Without sparsity + "peak_memory_bw_gbps": 3350, + "sm_count": 132, + "max_threads_per_sm": 2048, + "l1_cache_kb": 256, + "l2_cache_mb": 50, + "memory_gb": 80, + "memory_type": "HBM3", + "form_factor": "SXM5", + "tdp_w": 700, + }, + # NVIDIA H100 SKUs - PCIe Variant + "NVIDIA H100 PCIe 80GB": { + "name": "NVIDIA H100 PCIe 80GB", + "architecture": "Hopper", + "peak_fp32_tflops": 51.0, + "peak_fp16_tflops": 1513.0, # Without sparsity + "peak_bf16_tflops": 1513.0, # Without sparsity + "peak_memory_bw_gbps": 2000, + "sm_count": 114, + "max_threads_per_sm": 2048, + "l1_cache_kb": 256, + "l2_cache_mb": 50, + "memory_gb": 80, + "memory_type": "HBM2e", + "form_factor": "PCIe", + "tdp_w": 350, + }, + # NVIDIA H100 SKUs - NVL Variant (for LLM inference) + "NVIDIA H100 NVL 94GB": { + "name": "NVIDIA H100 NVL 94GB", + "architecture": "Hopper", + "peak_fp32_tflops": 60.0, + "peak_fp16_tflops": 1671.0, # Without sparsity + "peak_bf16_tflops": 1671.0, # Without sparsity + "peak_memory_bw_gbps": 3900, + "sm_count": 132, + "max_threads_per_sm": 2048, + "l1_cache_kb": 256, + "l2_cache_mb": 50, + "memory_gb": 94, + "memory_type": "HBM3", + "form_factor": "PCIe", + "tdp_w": 400, + }, + # NVIDIA RTX 4090 + "NVIDIA RTX 4090": { + "name": "NVIDIA RTX 4090", + "architecture": "Ada Lovelace", + "peak_fp32_tflops": 82.58, 
+ "peak_fp16_tflops": 82.58, + "peak_bf16_tflops": 82.58, + "peak_memory_bw_gbps": 1008, + "sm_count": 128, + "max_threads_per_sm": 1536, + "l1_cache_kb": 128, + "l2_cache_mb": 72, + "memory_gb": 24, + "memory_type": "GDDR6X", + "form_factor": "PCIe", + "tdp_w": 450, + }, + # NVIDIA RTX 5080 + "NVIDIA RTX 5080": { + "name": "NVIDIA RTX 5080", + "architecture": "Blackwell", + "peak_fp32_tflops": 56.28, + "peak_fp16_tflops": 56.28, + "peak_bf16_tflops": 56.28, + "peak_memory_bw_gbps": 960, + "sm_count": 84, + "max_threads_per_sm": 1536, + "l1_cache_kb": 128, + "l2_cache_mb": 64, + "memory_gb": 16, + "memory_type": "GDDR7", + "form_factor": "PCIe", + "tdp_w": 360, + }, +} diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py new file mode 100644 index 0000000..67fbb13 --- /dev/null +++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py @@ -0,0 +1,302 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Bottleneck Analysis Prompt Builder + +Provides prompt templates and parsing utilities for LLM-based bottleneck analysis +of NCU profiling metrics. + +Bottleneck Categories: +- memory: Memory bandwidth is the limiting factor +- compute: Compute throughput is the limiting factor +- underutilized: Neither saturated (<60% both), indicating stalls/occupancy issues + +Metric definitions are in metric_schema.py. +""" + +import json +import re +from dataclasses import dataclass, field +from typing import Any + +from kernel_perf_agent.kernel_opt.diagnose_prompt.metric_schema import ( + GPU_MEMORY_FIELDS, + GPU_SPEC_FIELDS, + NCU_METRIC_SECTIONS, +) +from kernel_perf_agent.kernel_opt.roofline.ncu_roofline import RooflineResult + +BOTTLENECK_CATEGORIES = {"memory", "compute", "underutilized"} + + +# ============================================================================= +# Data Classes +# ============================================================================= + + +@dataclass +class BottleneckResult: + """A single bottleneck analysis.""" + + category: str + summary: str + reasoning: str + root_causes: list[dict[str, Any]] = field(default_factory=list) + recommended_fixes: list[dict[str, Any]] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return { + "category": self.category, + "summary": self.summary, + "reasoning": self.reasoning, + "root_causes": self.root_causes, + "recommended_fixes": self.recommended_fixes, + } + + +# ============================================================================= +# Prompt Template +# ============================================================================= + + +BOTTLENECK_PROMPT = """\ +You are a GPU performance expert analyzing Triton kernel profiling data. + +## Task +Analyze the NCU metrics and identify {num_bottlenecks} performance bottleneck(s). 
For each, classify as: +- **memory**: Memory bandwidth is the limiting factor +- **compute**: Compute throughput is the limiting factor +- **underutilized**: Neither saturated (<60% both), indicating stalls/occupancy issues + +## GPU Specifications +{gpu_specs} + +## Roofline Analysis +- Bottleneck: {roofline_bottleneck} +- Compute SOL: {compute_sol:.1f}% +- Memory SOL: {memory_sol:.1f}% +- Efficiency: {efficiency:.1f}% +- Headroom: {headroom:.1f}% +- At Roofline: {at_roofline} +- Tensor Cores: {uses_tc} +- Warnings: {roofline_warnings} + +## NCU Metrics +{ncu_metrics} + +## Kernel Code +```python +{kernel_code} +``` + +## Output (JSON array, no markdown fence) +[ + {{ + "category": "memory" | "compute" | "underutilized", + "summary": "One-line summary", + "reasoning": "Explanation citing metrics", + "root_causes": [ + {{ + "cause": "Description", + "evidence": [{{"metric": "name", "value": 0.0, "interpretation": "meaning"}}] + }} + ], + "recommended_fixes": [ + {{"fix": "Actionable instruction", "rationale": "Why"}} + ] + }} +] + +Requirements: +- Provide exactly {num_bottlenecks} bottleneck analysis object(s) in the array. +- Order by importance (most critical first). +- Each bottleneck should have exactly {num_causes} root cause(s) and {num_fixes} corresponding fix(es). +- Keep summaries and reasoning concise and grounded in the provided metrics. +""" + + +# ============================================================================= +# Prompt Building +# ============================================================================= + + +def _fmt_value(v: Any) -> str: + """Format a value for display in prompts.""" + if isinstance(v, float): + return f"{v:.3g}" + if isinstance(v, int): + return str(v) + return str(v) + + +def _format_gpu_specs(gpu_specs: dict[str, Any]) -> str: + """Format GPU specifications using metric_schema definitions.""" + lines = [] + + for label, key, unit in GPU_SPEC_FIELDS: + value = gpu_specs.get(key) + if value is not None: + lines.append(f"- {label}: {_fmt_value(value)}{unit}") + + for label, size_key, type_key, unit in GPU_MEMORY_FIELDS: + size = gpu_specs.get(size_key) + mem_type = gpu_specs.get(type_key, "") + if size is not None: + type_str = f" {mem_type}" if mem_type else "" + lines.append(f"- {label}: {_fmt_value(size)}{unit}{type_str}") + + return "\n".join(lines) if lines else "N/A" + + +def _format_ncu_metrics(ncu_metrics: dict[str, Any]) -> str: + """Format NCU metrics grouped by section using metric_schema definitions.""" + lines = [] + + for section_name, metric_defs in NCU_METRIC_SECTIONS.items(): + section_lines = [] + for label, key, unit in metric_defs: + value = ncu_metrics.get(key) + if value is not None: + section_lines.append(f" - {label}: {_fmt_value(value)}{unit}") + + if section_lines: + lines.append(f"### {section_name}") + lines.extend(section_lines) + + schema_keys = {key for _, key, _ in sum(NCU_METRIC_SECTIONS.values(), [])} + other_keys = sorted(set(ncu_metrics.keys()) - schema_keys) + if other_keys: + lines.append("### Other Metrics") + for key in other_keys: + value = ncu_metrics[key] + lines.append(f" - {key}: {_fmt_value(value)}") + + return "\n".join(lines) if lines else "N/A" + + +def build_bottleneck_prompt( + kernel_code: str, + ncu_metrics: dict[str, Any], + roofline: RooflineResult, + gpu_specs: dict[str, Any], + num_bottlenecks: int = 2, + num_causes: int = 2, + num_fixes: int = 1, +) -> str: + """Build the bottleneck analysis prompt for the LLM. + + Args: + kernel_code: The Triton kernel source code. 
+ ncu_metrics: NCU profiling metrics dictionary. + roofline: Roofline analysis result. + gpu_specs: GPU hardware specifications. + num_bottlenecks: Number of bottlenecks to request. + num_causes: Number of root causes per bottleneck. + num_fixes: Number of recommended fixes per bottleneck. + + Returns: + Formatted prompt string for the LLM. + """ + return BOTTLENECK_PROMPT.format( + num_bottlenecks=num_bottlenecks, + num_causes=num_causes, + num_fixes=num_fixes, + gpu_specs=_format_gpu_specs(gpu_specs), + roofline_bottleneck=roofline.bottleneck, + compute_sol=roofline.compute_sol_pct, + memory_sol=roofline.memory_sol_pct, + efficiency=roofline.efficiency_pct, + headroom=roofline.headroom_pct, + at_roofline="Yes" if roofline.at_roofline else "No", + uses_tc="Yes" if roofline.uses_tensor_cores else "No", + roofline_warnings="; ".join(roofline.warnings) or "None", + ncu_metrics=_format_ncu_metrics(ncu_metrics), + kernel_code=kernel_code, + ) + + +# ============================================================================= +# Response Parsing +# ============================================================================= + + +def parse_bottleneck_response( + response: str, + fallback_category: str = "underutilized", +) -> list[BottleneckResult]: + """Parse LLM response into a list of BottleneckResult. + + Args: + response: Raw LLM response text. + fallback_category: Category to use if parsing fails. + + Returns: + List of BottleneckResult. Empty list if parsing fails completely. + """ + # Try to find JSON array + array_match = re.search(r"\[[\s\S]*\]", response) + if array_match: + try: + data = json.loads(array_match.group()) + if isinstance(data, list): + return _parse_bottleneck_list(data, fallback_category) + except json.JSONDecodeError: + pass + + # Fall back to single object + obj_match = re.search(r"\{[\s\S]*\}", response) + if obj_match: + try: + data = json.loads(obj_match.group()) + if isinstance(data, dict): + return _parse_bottleneck_list([data], fallback_category) + except json.JSONDecodeError: + pass + + return [] + + +def _parse_bottleneck_list( + items: list[dict[str, Any]], + fallback_category: str, +) -> list[BottleneckResult]: + """Parse a list of bottleneck dicts into BottleneckResult objects.""" + results = [] + for item in items: + category = item.get("category", fallback_category) + if category not in BOTTLENECK_CATEGORIES: + category = fallback_category + + root_causes = [ + {"cause": rc.get("cause", "Unknown"), "evidence": rc.get("evidence", [])} + for rc in item.get("root_causes", []) + ] + + fixes = [ + {"fix": f.get("fix", ""), "rationale": f.get("rationale", "")} + for f in item.get("recommended_fixes", []) + ] + + results.append( + BottleneckResult( + category=category, + summary=item.get("summary", f"{category}-bound"), + reasoning=item.get("reasoning", ""), + root_causes=root_causes, + recommended_fixes=fixes, + ) + ) + + return results diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py new file mode 100644 index 0000000..64d1d67 --- /dev/null +++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py @@ -0,0 +1,151 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Metric Schema Definitions for NCU Profiling and GPU Specifications. + +This module provides the single source of truth for: +- NCU profiling metric definitions (keys, labels, units) +- GPU specification field definitions + +Schema Format: List of tuples (display_label, key, unit_suffix) +- display_label: Human-readable name shown in prompts +- key: NCU metric key or GPU spec dictionary key +- unit_suffix: Unit to append after value (e.g., "%", " GB/s", " bytes") +""" + +from typing import Dict, List, Tuple + +# Type alias for metric definition: (label, key, unit) +MetricDef = Tuple[str, str, str] + +# ============================================================================= +# GPU Specification Fields +# ============================================================================= + +GPU_SPEC_FIELDS: List[MetricDef] = [ + ("Name", "name", ""), + ("Architecture", "architecture", ""), + ("Peak Memory Bandwidth", "peak_memory_bw_gbps", " GB/s"), + ("Peak FP32 Performance", "peak_fp32_tflops", " TFLOPS"), + ("Peak FP16 Performance", "peak_fp16_tflops", " TFLOPS"), + ("SM Count", "sm_count", ""), + ("Max Threads per SM", "max_threads_per_sm", ""), + ("L1 Cache per SM", "l1_cache_kb", " KB"), + ("L2 Cache (Total)", "l2_cache_mb", " MB"), +] + +# Special case: Memory Size has two fields combined +GPU_MEMORY_FIELDS: List[Tuple[str, str, str, str]] = [ + # (label, size_key, type_key, size_unit) + ("Memory Size", "memory_gb", "memory_type", " GB"), +] + +# ============================================================================= +# NCU Profiling Metric Sections +# ============================================================================= + +NCU_METRIC_SECTIONS: Dict[str, List[MetricDef]] = { + "SM & Compute Utilization": [ + ("SM Cycles Active", "sm__cycles_active.avg", ""), + ("Warp Active", "sm__warps_active.avg.pct_of_peak_sustained_active", "%"), + ("Total Instructions Executed", "sm__inst_executed.sum", ""), + ( + "Tensor Core Utilization", + "sm__inst_executed_pipe_tensor.avg.pct_of_peak_sustained_active", + "%", + ), + ( + "Tensor Core Pipeline Active", + "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed", + "%", + ), + ], + "Memory Bandwidth & Cache": [ + ( + "DRAM Throughput", + "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "%", + ), + ("DRAM Bandwidth", "dram__bytes.sum.per_second", " bytes/sec"), + ( + "GPU DRAM Throughput", + "gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed", + "%", + ), + ("DRAM Bytes Read", "dram__bytes_read.sum", " bytes"), + ("DRAM Bytes Write", "dram__bytes_write.sum", " bytes"), + ("L1 Cache Hit Rate", "l1tex__t_sector_hit_rate.pct", "%"), + ( + "L1 Throughput", + "l1tex__throughput.avg.pct_of_peak_sustained_active", + "%", + ), + ("L2 Cache Hit Rate", "lts__t_sector_hit_rate.pct", "%"), + ( + "L2 Throughput", + "lts__throughput.avg.pct_of_peak_sustained_active", + "%", + ), + ], + "Memory Access Patterns": [ + ( + "Memory Coalescing", + "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct", + "%", + ), + ( + "Branch Uniformity", + "smsp__sass_average_branch_targets_threads_uniform.pct", + "%", + 
), + ], + "Occupancy & Resources": [ + ("Occupancy Limited By Blocks", "launch__occupancy_limit_blocks", ""), + ("Occupancy Limited By Registers", "launch__occupancy_limit_registers", ""), + ( + "Occupancy Limited By Shared Memory", + "launch__occupancy_limit_shared_mem", + "", + ), + ("Registers per Thread", "launch__registers_per_thread", ""), + ( + "Shared Memory per Block", + "launch__shared_mem_per_block_allocated", + " bytes", + ), + ], + "Stall Metrics (Warp Issue Stalls)": [ + ( + "Short Scoreboard Stalls", + "smsp__warp_issue_stalled_short_scoreboard_per_warp_active.pct", + "%", + ), + ( + "Long Scoreboard Stalls", + "smsp__warp_issue_stalled_long_scoreboard_per_warp_active.pct", + "%", + ), + ( + "Barrier Stalls", + "smsp__warp_issue_stalled_barrier_per_warp_active.pct", + "%", + ), + ( + "Branch Resolving Stalls", + "smsp__warp_issue_stalled_branch_resolving_per_warp_active.pct", + "%", + ), + ], +} diff --git a/kernel_perf_agent/kernel_opt/profiler/__init__.py b/kernel_perf_agent/kernel_opt/profiler/__init__.py index d177194..d0a028c 100644 --- a/kernel_perf_agent/kernel_opt/profiler/__init__.py +++ b/kernel_perf_agent/kernel_opt/profiler/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""kernel_perf_agent package.""" +"""NCU profiling module for kernel performance analysis.""" -# Kernel Perf Agent package __all__ = [] diff --git a/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py b/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py index 4ce8568..4b1bf83 100644 --- a/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py +++ b/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py @@ -255,12 +255,8 @@ def _apply_selection_policy( Returns: DataFrame with a single row based on the policy """ - if df.empty: + if len(df) <= 1: return df - - if len(df) == 1: - return df - if policy == MetricSelectionPolicy.FIRST: return df.iloc[[0]] elif policy == MetricSelectionPolicy.LAST: @@ -317,7 +313,7 @@ def load_ncu_metrics( extra_keep: Optional[Sequence[str]] = ("Kernel Name",), coerce_numeric: bool = True, name_list: Optional[Sequence[str]] = None, - select: Union[str, MetricSelectionPolicy] = MetricSelectionPolicy.LAST, + select: MetricSelectionPolicy = MetricSelectionPolicy.LAST, ) -> pd.DataFrame: """ Load and parse NCU metrics from CSV file. @@ -328,32 +324,19 @@ def load_ncu_metrics( extra_keep: Additional columns to keep (e.g., "Kernel Name") coerce_numeric: Convert metric values to numeric name_list: Filter by kernel names (substring match) - select: Selection policy when multiple rows per name. - Can be MetricSelectionPolicy enum or string ("first", "last", "max_cycles") + select: Selection policy when multiple rows per name Returns: DataFrame with parsed metrics Raises: FileNotFoundError: If CSV file not found - ValueError: If no requested columns found in CSV or invalid select value + ValueError: If no requested columns found in CSV """ csv_path = Path(csv_path) if not csv_path.exists(): raise FileNotFoundError(f"CSV not found: {csv_path}") - # Convert string to enum if needed - if isinstance(select, str): - try: - policy = MetricSelectionPolicy(select) - except ValueError: - raise ValueError( - f"Invalid select value: {select}. 
" - f"Must be one of: {[p.value for p in MetricSelectionPolicy]}" - ) - else: - policy = select - df = pd.read_csv(csv_path, comment="=", low_memory=False) metric_cols = list(columns) if columns is not None else METRIC_COLUMNS @@ -383,14 +366,11 @@ def load_ncu_metrics( .apply(pd.to_numeric, errors="coerce") ) - # Filter by kernel name list if provided - if name_list: - sub = _filter_by_kernel_names(sub, name_list, policy, keep_cols) - else: - # Apply selection to all rows if no name filter - sub = _apply_selection_policy(sub, policy) - - return sub + return ( + _filter_by_kernel_names(sub, name_list, select, keep_cols) + if name_list + else _apply_selection_policy(sub, select) + ) def metrics_to_prompt( diff --git a/kernel_perf_agent/kernel_opt/roofline/__init__.py b/kernel_perf_agent/kernel_opt/roofline/__init__.py new file mode 100644 index 0000000..f3d8afe --- /dev/null +++ b/kernel_perf_agent/kernel_opt/roofline/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Roofline analysis module for kernel performance optimization.""" + +__all__ = [] diff --git a/kernel_perf_agent/kernel_opt/roofline/ncu_roofline.py b/kernel_perf_agent/kernel_opt/roofline/ncu_roofline.py new file mode 100644 index 0000000..6e2131c --- /dev/null +++ b/kernel_perf_agent/kernel_opt/roofline/ncu_roofline.py @@ -0,0 +1,251 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Roofline Analysis Module using NCU SOL (Speed of Light) Metrics. + +This module uses NCU's built-in SOL metrics to determine kernel efficiency +relative to hardware limits + +NCU SOL metrics directly measure how close performance is to peak: +- Compute SOL: SM throughput as % of peak +- Memory SOL: DRAM throughput as % of peak + +Updated in January 2026 +""" + +import logging +from dataclasses import dataclass, field +from typing import Any + + +# NCU metrics needed for roofline analysis +# Note: The profiler (ncu_profiler.py) collects these and more metrics. +# This list documents the minimum required for roofline decisions. 
+
+NCU_ROOFLINE_METRICS = [
+    # Primary SOL metrics (Speed of Light)
+    "gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed",  # Memory SOL
+    "sm__throughput.avg.pct_of_peak_sustained_elapsed",  # Compute SOL
+    # Tensor core detection
+    "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active",
+]
+
+
+@dataclass
+class RooflineConfig:
+    """Configuration for roofline analysis."""
+
+    threshold_pct: float = 90.0  # SOL % to consider at roofline
+    early_stop: bool = True  # Stop optimization when at roofline
+    convergence_rounds: int = 5  # Rounds without improvement to trigger stop
+    min_improvement_pct: float = 0.1  # Minimum improvement to continue
+    tensor_core_threshold: float = 5.0  # Min TC activity % to consider TC usage
+    underutilized_threshold: float = 60.0  # Both SOL < this % = underutilized
+
+
+@dataclass
+class RooflineResult:
+    """Result of roofline analysis using NCU SOL metrics."""
+
+    # SOL metrics from NCU (primary)
+    compute_sol_pct: float  # SM throughput as % of peak
+    memory_sol_pct: float  # DRAM throughput as % of peak
+
+    # Derived efficiency (max of compute/memory SOL)
+    efficiency_pct: float  # Primary efficiency metric for decisions
+    at_roofline: bool  # True if efficiency >= threshold_pct
+    headroom_pct: float  # 100 - efficiency
+
+    # Classification
+    bottleneck: str  # "memory" | "compute" | "underutilized"
+    uses_tensor_cores: bool  # Whether TC is active
+
+    # Data quality
+    warnings: list[str] = field(default_factory=list)
+
+
+class RooflineAnalyzer:
+    """Analyzes kernel performance using NCU SOL metrics."""
+
+    def __init__(
+        self,
+        config: RooflineConfig | None = None,
+        logger: logging.Logger | None = None,
+    ):
+        """
+        Initialize the roofline analyzer.
+
+        Args:
+            config: Roofline configuration (defaults to RooflineConfig())
+            logger: Logger instance
+        """
+        self.config = config or RooflineConfig()
+        self.logger = logger or logging.getLogger(__name__)
+        self._efficiency_history: list[float] = []
+
+    def _is_using_tensor_cores(self, ncu_metrics: dict[str, Any]) -> bool:
+        """Detect tensor core usage from NCU metrics."""
+        tc_cycles = ncu_metrics.get(
+            "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active", 0
+        )
+        return tc_cycles > self.config.tensor_core_threshold
+
+    def _classify_bottleneck(self, compute_sol: float, memory_sol: float) -> str:
+        """
+        Classify bottleneck based on SOL metrics.
+
+        The HIGHER SOL value indicates the bottleneck: the resource closest to its
+        peak throughput is the one limiting further speedup.
+        If both are below the threshold, the kernel is underutilized (could be occupancy,
+        instruction mix, launch config, dependency stalls, etc.).
+        """
+        threshold = self.config.underutilized_threshold
+
+        # Both low = underutilized (neither resource is saturated)
+        if memory_sol < threshold and compute_sol < threshold:
+            return "underutilized"
+
+        # Return whichever is closer to its peak (the saturated resource)
+        if memory_sol >= compute_sol:
+            return "memory"
+        else:
+            return "compute"
+
+    def analyze(
+        self,
+        ncu_metrics: dict[str, Any],
+    ) -> RooflineResult:
+        """
+        Analyze kernel performance using NCU SOL metrics.
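+
+        Illustrative example (hypothetical values): a compute SOL of 35% and a memory
+        SOL of 82% yield bottleneck="memory", efficiency_pct=82.0 and headroom_pct=18.0;
+        if both SOL values fall below the underutilized threshold (60% by default),
+        the kernel is classified as "underutilized".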
+ + Args: + ncu_metrics: NCU profiling metrics dictionary + + Returns: + RooflineResult with SOL-based efficiency analysis + """ + warnings: list[str] = [] + + # Extract SOL metrics with missing-key detection + compute_key = "sm__throughput.avg.pct_of_peak_sustained_elapsed" + memory_key = "gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed" + + compute_missing = compute_key not in ncu_metrics + memory_missing = memory_key not in ncu_metrics + + if compute_missing: + self.logger.warning("Compute SOL metric missing from NCU data") + warnings.append("Compute SOL metric missing") + if memory_missing: + self.logger.warning("Memory SOL metric missing from NCU data") + warnings.append("Memory SOL metric missing") + + # Fail only if both keys are absent + if compute_missing and memory_missing: + return RooflineResult( + compute_sol_pct=0, + memory_sol_pct=0, + efficiency_pct=0, + at_roofline=False, + headroom_pct=100, + bottleneck="unknown", + uses_tensor_cores=False, + warnings=["Analysis failed - no SOL metrics in NCU data"], + ) + + compute_sol = ncu_metrics.get(compute_key, 0) + memory_sol = ncu_metrics.get(memory_key, 0) + + # Primary efficiency: use max of compute/memory + efficiency = max(compute_sol, memory_sol) + + # Tensor core detection + uses_tc = self._is_using_tensor_cores(ncu_metrics) + + # Classify bottleneck + bottleneck = self._classify_bottleneck(compute_sol, memory_sol) + + # Check if at roofline + at_roofline = efficiency >= self.config.threshold_pct + + return RooflineResult( + compute_sol_pct=compute_sol, + memory_sol_pct=memory_sol, + efficiency_pct=efficiency, + at_roofline=at_roofline, + headroom_pct=max(0, 100 - efficiency), + bottleneck=bottleneck, + uses_tensor_cores=uses_tc, + warnings=warnings, + ) + + def should_stop(self, result: RooflineResult) -> tuple[bool, str]: + """ + Check if optimization should stop based on SOL efficiency and convergence. 
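+
+        For example, with the default config (threshold_pct=90, convergence_rounds=5,
+        min_improvement_pct=0.1), a result at 92% efficiency stops immediately when
+        early_stop is enabled, and five consecutive rounds whose efficiencies differ
+        by less than 0.1% stop with a "Converged" reason. (Values are illustrative only.)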
+ + Args: + result: RooflineResult from analyze() + + Returns: + Tuple of (should_stop, reason) + """ + self._efficiency_history.append(result.efficiency_pct) + + # Condition 1: At roofline threshold (if early_stop enabled) + if self.config.early_stop and result.at_roofline: + return ( + True, + f"At roofline ({result.efficiency_pct:.1f}% SOL >= " + f"{self.config.threshold_pct}%)", + ) + + # Condition 2: Efficiency converged (no improvement for N rounds) + if len(self._efficiency_history) >= self.config.convergence_rounds: + recent = self._efficiency_history[-self.config.convergence_rounds :] + improvement = max(recent) - min(recent) + if improvement < self.config.min_improvement_pct: + return ( + True, + f"Converged (improvement {improvement:.2f}% < " + f"{self.config.min_improvement_pct}%)", + ) + + return False, "" + + def reset_history(self) -> None: + """Reset efficiency history for a new optimization run.""" + self._efficiency_history = [] + + +def format_roofline_summary(result: RooflineResult) -> str: + """Format a human-readable summary of roofline analysis.""" + lines = [ + "=== Roofline Analysis ===", + f"SOL Efficiency: {result.efficiency_pct:.1f}%", + f" Compute SOL: {result.compute_sol_pct:.1f}%", + f" Memory SOL: {result.memory_sol_pct:.1f}%", + f" Bottleneck: {result.bottleneck}", + f" Tensor Cores: {'Yes' if result.uses_tensor_cores else 'No'}", + "", + ] + + if result.at_roofline: + lines.append("Status: AT ROOFLINE") + else: + lines.append(f"Headroom: {result.headroom_pct:.1f}%") + + if result.warnings: + lines.append(f"Warnings: {'; '.join(result.warnings)}") + + return "\n".join(lines) diff --git a/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py b/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py index 733216c..fe1e84f 100644 --- a/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py +++ b/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py @@ -288,12 +288,11 @@ def main(): args = _parse_args() device = torch.device(args.device) - dtype_map = { + dtype = { "float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16, - } - dtype = dtype_map[args.dtype] + }[args.dtype] if not args.quiet: print("=" * 80) diff --git a/triton_kernel_agent/opt_worker_component/prescribing/__init__.py b/triton_kernel_agent/opt_worker_component/prescribing/__init__.py new file mode 100644 index 0000000..c515b2a --- /dev/null +++ b/triton_kernel_agent/opt_worker_component/prescribing/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Prescribing module for kernel optimization.""" + +from .bottleneck_analyzer import BottleneckAnalyzer + +__all__ = ["BottleneckAnalyzer"] diff --git a/triton_kernel_agent/opt_worker_component/prescribing/bottleneck_analyzer.py b/triton_kernel_agent/opt_worker_component/prescribing/bottleneck_analyzer.py new file mode 100644 index 0000000..b774f0a --- /dev/null +++ b/triton_kernel_agent/opt_worker_component/prescribing/bottleneck_analyzer.py @@ -0,0 +1,159 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Bottleneck Analyzer - LLM-based NCU profiling analysis. + +This module orchestrates LLM calls for bottleneck analysis using: +- judger_prompt.py: Prompt template, parsing, BottleneckResult dataclass +- ncu_roofline.py: Roofline analysis using NCU SOL metrics + +Bottleneck Categories: +- memory: Memory bandwidth is the limiting factor +- compute: Compute throughput is the limiting factor +- underutilized: Neither saturated (<60% both), indicating stalls/occupancy issues +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Any + +from kernel_perf_agent.kernel_opt.diagnose_prompt.judger_prompt import ( + BottleneckResult, + build_bottleneck_prompt, + parse_bottleneck_response, +) +from kernel_perf_agent.kernel_opt.roofline.ncu_roofline import RooflineAnalyzer +from triton_kernel_agent.worker_util import _call_llm, _save_debug_file +from utils.providers.base import BaseProvider + + +class BottleneckAnalyzer: + """LLM-based bottleneck analyzer using NCU metrics.""" + + def __init__( + self, + provider: BaseProvider, + model: str, + gpu_specs: dict[str, Any], + logs_dir: Path | None = None, + logger: logging.Logger | None = None, + num_bottlenecks: int = 1, + num_causes: int = 2, + num_fixes: int = 1, + enable_debug: bool = True, + ): + """ + Initialize bottleneck analyzer. + + Args: + provider: LLM provider instance + model: Model name for LLM calls + gpu_specs: GPU hardware specifications + logs_dir: Directory for saving debug files + logger: Logger instance + num_bottlenecks: Number of bottlenecks to request from LLM + num_causes: Number of root causes per bottleneck + num_fixes: Number of recommended fixes per bottleneck + enable_debug: Whether to save debug files (prompts/responses) + """ + self.provider = provider + self.model = model + self.gpu_specs = gpu_specs + self.logs_dir = logs_dir + self.logger = logger or logging.getLogger(__name__) + self.num_bottlenecks = num_bottlenecks + self.num_causes = num_causes + self.num_fixes = num_fixes + self.enable_debug = enable_debug + self.roofline = RooflineAnalyzer(logger=logger) + + def analyze( + self, + kernel_code: str, + ncu_metrics: dict[str, Any], + round_num: int = 0, + roofline_result: Any = None, + ) -> list[BottleneckResult]: + """ + Analyze kernel bottlenecks using LLM. 
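+
+        ncu_metrics is expected to be keyed by kernel name, with each value being a
+        flat metric dictionary; PyTorch kernels (names starting with "at::" or
+        "void at::") are skipped when selecting the metrics used for the internal
+        roofline analysis (i.e., when roofline_result is not supplied).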
+ + Args: + kernel_code: The Triton kernel source code + ncu_metrics: NCU profiling metrics dictionary + round_num: Current optimization round (for logging) + roofline_result: Pre-computed RooflineResult (if None, computed internally) + + Returns: + List of BottleneckResult (ordered by importance). + Empty list if analysis fails. + """ + if roofline_result is None: + # Filter out PyTorch kernels (at::*) and get Triton kernel metrics + if ncu_metrics: + triton_kernels = { + name: metrics + for name, metrics in ncu_metrics.items() + if not name.startswith("at::") and not name.startswith("void at::") + } + flat_metrics = ( + next(iter(triton_kernels.values())) + if triton_kernels + else next(iter(ncu_metrics.values()), {}) + ) + else: + flat_metrics = {} + roofline_result = self.roofline.analyze(flat_metrics) + + prompt = build_bottleneck_prompt( + kernel_code=kernel_code, + ncu_metrics=ncu_metrics, + roofline=roofline_result, + gpu_specs=self.gpu_specs, + num_bottlenecks=self.num_bottlenecks, + num_causes=self.num_causes, + num_fixes=self.num_fixes, + ) + + response = _call_llm( + provider=self.provider, + model=self.model, + messages=[{"role": "user", "content": prompt}], + logger=self.logger, + max_tokens=16384, + ) + + if self.enable_debug and self.logs_dir: + _save_debug_file( + self.logs_dir / f"round{round_num:03d}_bottleneck_prompt.txt", + prompt, + self.logger, + ) + _save_debug_file( + self.logs_dir / f"round{round_num:03d}_bottleneck_response.txt", + response, + self.logger, + ) + + results = parse_bottleneck_response(response) + + if results: + categories = [r.category for r in results] + self.logger.info(f"[{round_num}] Bottlenecks: {', '.join(categories)}") + else: + self.logger.warning(f"[{round_num}] Failed to parse bottleneck response") + + return results diff --git a/triton_kernel_agent/prompt_manager.py b/triton_kernel_agent/prompt_manager.py index 9534fc9..7252aea 100644 --- a/triton_kernel_agent/prompt_manager.py +++ b/triton_kernel_agent/prompt_manager.py @@ -88,6 +88,7 @@ def _load_templates(self): "test_generation": "test_generation.j2", "kernel_generation": "kernel_generation.j2", "kernel_refinement": "kernel_refinement.j2", + "kernel_optimization": "kernel_optimization.j2", "triton_guidelines": "triton_guidelines.j2", } @@ -194,6 +195,64 @@ def render_kernel_refinement_prompt( no_cusolver=no_cusolver, ) + def render_kernel_optimization_prompt( + self, + problem_description: str, + kernel_code: str, + gpu_specs: dict, + roofline: dict, + category: str, + summary: str, + reasoning: str, + root_cause: dict, + recommended_fix: dict, + pytorch_baseline_ms: float | None = None, + current_best_ms: float | None = None, + error_feedback: str | None = None, + ) -> str: + """ + Render the kernel optimization prompt. 
+ + Args: + problem_description: Description of the problem + kernel_code: Current kernel implementation + gpu_specs: GPU hardware specifications dict + roofline: Roofline analysis result dict with keys: + bottleneck, compute_sol_pct, memory_sol_pct, efficiency_pct, + headroom_pct, at_roofline, uses_tensor_cores, warnings + category: Bottleneck category ("memory", "compute", "underutilized") + summary: One-line bottleneck summary + reasoning: Explanation citing metrics + root_cause: Single root cause dict {"cause": "...", "evidence": [...]} + recommended_fix: Single fix dict {"fix": "...", "rationale": "..."} + pytorch_baseline_ms: PyTorch Eager baseline time in ms + current_best_ms: Current best kernel time in ms (for iterative opt) + error_feedback: Error message from previous failed attempt + + Returns: + Rendered prompt string + """ + template = self.templates["kernel_optimization"] + + bottleneck = { + "category": category, + "summary": summary, + "reasoning": reasoning, + "root_cause": root_cause, + "recommended_fix": recommended_fix, + } + + return template.render( + problem_description=problem_description, + kernel_code=kernel_code, + gpu_specs=gpu_specs, + roofline=roofline, + bottleneck=bottleneck, + pytorch_baseline_ms=pytorch_baseline_ms, + current_best_ms=current_best_ms, + error_feedback=error_feedback, + ) + def render_triton_guidelines(self) -> str: """ Render the Triton guidelines. diff --git a/triton_kernel_agent/templates/kernel_optimization.j2 b/triton_kernel_agent/templates/kernel_optimization.j2 new file mode 100644 index 0000000..92fe699 --- /dev/null +++ b/triton_kernel_agent/templates/kernel_optimization.j2 @@ -0,0 +1,101 @@ +{# +Copyright (c) Meta Platforms, Inc. and affiliates. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +#} + +TASK: Optimize the following Triton kernel based on hardware profiling analysis to achieve better performance. 
+
+{% if gpu_specs %}
+## TARGET GPU
+- GPU: {{ gpu_specs.name }}
+- Architecture: {{ gpu_specs.architecture }}
+- Peak Memory Bandwidth: {{ gpu_specs.peak_memory_bw_gbps }} GB/s
+- Peak FP32: {{ gpu_specs.peak_fp32_tflops }} TFLOPS
+- Peak FP16: {{ gpu_specs.peak_fp16_tflops }} TFLOPS
+- Peak BF16: {{ gpu_specs.peak_bf16_tflops }} TFLOPS
+- SM Count: {{ gpu_specs.sm_count }}
+- Max Threads per SM: {{ gpu_specs.max_threads_per_sm }}
+- L1 Cache per SM: {{ gpu_specs.l1_cache_kb }} KB
+- L2 Cache: {{ gpu_specs.l2_cache_mb }} MB
+- Memory: {{ gpu_specs.memory_gb }} GB {{ gpu_specs.memory_type }}
+{% endif %}
+
+## PROBLEM DESCRIPTION
+{{ problem_description }}
+{% if pytorch_baseline_ms %}
+PyTorch Eager baseline: {{ "%.4f"|format(pytorch_baseline_ms) }} ms
+{% endif %}
+
+## CURRENT KERNEL
+```python
+{{ kernel_code }}
+```
+
+{% if roofline %}
+## ROOFLINE ANALYSIS
+- Primary Bottleneck: {{ roofline.bottleneck | upper }}
+- Compute SOL: {{ "%.1f"|format(roofline.compute_sol_pct) }}%
+- Memory SOL: {{ "%.1f"|format(roofline.memory_sol_pct) }}%
+- Efficiency: {{ "%.1f"|format(roofline.efficiency_pct) }}% (headroom: {{ "%.1f"|format(roofline.headroom_pct) }}%)
+- At Roofline: {{ "Yes" if roofline.at_roofline else "No" }}
+- Tensor Cores: {{ "Active" if roofline.uses_tensor_cores else "Inactive" }}
+{%- if roofline.warnings %}
+- Warnings: {{ roofline.warnings | join("; ") }}
+{%- endif %}
+{% endif %}
+
+## BOTTLENECK ANALYSIS
+### Category: {{ bottleneck.category | upper }}
+{{ bottleneck.summary }}
+
+**Reasoning:** {{ bottleneck.reasoning }}
+
+**Root Cause:** {{ bottleneck.root_cause.cause }}
+{%- if bottleneck.root_cause.evidence %}
+  Evidence: {% for e in bottleneck.root_cause.evidence %}{{ e.metric }}={{ e.value }}{% if not loop.last %}, {% endif %}{% endfor %}
+{%- endif %}
+
+**Recommended Fix:** {{ bottleneck.recommended_fix.fix }}
+{%- if bottleneck.recommended_fix.rationale %} ({{ bottleneck.recommended_fix.rationale }}){% endif %}
+
+{% if error_feedback %}
+## PREVIOUS ATTEMPT FAILED
+{{ error_feedback }}
+{% endif %}
+
+## PERFORMANCE TARGET
+{% if pytorch_baseline_ms %}
+- PyTorch Eager baseline: {{ "%.4f"|format(pytorch_baseline_ms) }} ms
+{% endif %}
+{% if current_best_ms %}
+- Current best kernel: {{ "%.4f"|format(current_best_ms) }} ms
+- Target: Improve by at least 10% (< {{ "%.4f"|format(current_best_ms * 0.9) }} ms)
+{% elif pytorch_baseline_ms %}
+- Target: Improve by at least 10% over Eager (< {{ "%.4f"|format(pytorch_baseline_ms * 0.9) }} ms)
+{% else %}
+- Target: Improve latency by at least 10% over the current kernel
+{% endif %}
+- Maintain numerical correctness (atol=1e-4 or rtol=1e-4)
+- Preserve public API (same inputs/outputs, shapes, dtypes)
+
+## REQUIREMENTS
+1. Apply the recommended fix above to address the {{ bottleneck.category | upper }} bottleneck
+2. The implementation must be a complete, valid Python file
+3. Main function must be named 'kernel_function' wrapping the Triton kernel
+4. Keep the wrapper free of PyTorch compute primitives
+
+## OUTPUT FORMAT
+Output complete optimized kernel code in ```python blocks.
+Include only: imports, Triton kernel (@triton.jit), wrapper function (kernel_function).
+No testing code, benchmarks, or explanatory comments.
+
+Generate the complete optimized kernel implementation:
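
Example usage of the new diagnose/roofline modules. This is a minimal sketch: the NCU metric values and the kernel source are placeholders, and the hard-coded JSON string stands in for a real LLM response; in the agent, BottleneckAnalyzer performs the provider call and the same parsing.

```python
# Minimal sketch (hypothetical metric values; no GPU or LLM required).
from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs import get_gpu_specs
from kernel_perf_agent.kernel_opt.diagnose_prompt.judger_prompt import (
    build_bottleneck_prompt,
    parse_bottleneck_response,
)
from kernel_perf_agent.kernel_opt.roofline.ncu_roofline import (
    RooflineAnalyzer,
    format_roofline_summary,
)

gpu_specs = get_gpu_specs("NVIDIA H100 SXM5 80GB")  # None if the SKU is not in the database

# Flat NCU metrics for one Triton kernel, as produced by the profiler (values are made up).
ncu_metrics = {
    "sm__throughput.avg.pct_of_peak_sustained_elapsed": 34.2,
    "gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed": 81.7,
    "dram__throughput.avg.pct_of_peak_sustained_elapsed": 79.3,
    "l1tex__t_sector_hit_rate.pct": 41.0,
}

roofline = RooflineAnalyzer().analyze(ncu_metrics)
print(format_roofline_summary(roofline))  # memory-bound, ~82% SOL efficiency

prompt = build_bottleneck_prompt(
    kernel_code="# Triton kernel source goes here",
    ncu_metrics=ncu_metrics,
    roofline=roofline,
    gpu_specs=gpu_specs or {},
    num_bottlenecks=1,
)

# In the agent, BottleneckAnalyzer sends `prompt` to an LLM; the string below stands in
# for that response so the parsing path can be exercised directly.
fake_response = (
    '[{"category": "memory", "summary": "DRAM bandwidth is saturated", '
    '"reasoning": "Memory SOL 81.7% vs compute SOL 34.2%", '
    '"root_causes": [], "recommended_fixes": []}]'
)
for b in parse_bottleneck_response(fake_response):
    print(b.category, "-", b.summary)
```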