
Commit 9863664

[Bench] Add TorchSlmSize benchmark (#20937)
Also adds a verbose option to bench's integration tests.
Parent: 9b78f08

2 files changed: 106 additions & 16 deletions


devops/scripts/benchmarks/benches/compute.py

Lines changed: 73 additions & 9 deletions
@@ -61,8 +61,8 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"
 
     def git_hash(self) -> str:
-        # Nov 17, 2025
-        return "932ae79f7cca7e156285fc10a59610927c769e89"
+        # Dec 17, 2025
+        return "420549188cd8900c27cf9b04fd859ebe81876a99"
 
     def setup(self) -> None:
         if options.sycl is None:
@@ -182,7 +182,7 @@ def benchmarks(self) -> list[Benchmark]:
                 GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
             )
 
-        # Add ULLS benchmarks
+        # Add ULLS benchmarks
         for runtime in list(RUNTIMES):
             if runtime == RUNTIMES.SYCL:
                 benches.append(
@@ -355,6 +355,36 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
                ),
            ]
 
+        # Add TorchSlmSize benchmarks
+        for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
+
+            def createTorchSlmSizeBench(variant_name: str, **kwargs):
+                return TorchSlmSize(
+                    self,
+                    runtime,
+                    variant_name,
+                    PROFILERS.TIMER,
+                    **{**kwargs, "warmupIterations": 1},
+                )
+
+            benches += [
+                createTorchSlmSizeBench(
+                    "small",
+                    batchSize=512,
+                    slmNum=1,
+                ),
+                createTorchSlmSizeBench(
+                    "medium",
+                    batchSize=512,
+                    slmNum=1024,
+                ),
+                createTorchSlmSizeBench(
+                    "max",
+                    batchSize=512,
+                    slmNum=-1,
+                ),
+            ]
+
         # Add UR-specific benchmarks
         benches += [
             # TODO: multithread_benchmark_ur fails with segfault
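
Each variant's keyword arguments are forwarded verbatim as binary flags by the _bin_args helper further down in this diff, with warmupIterations=1 merged in by the factory above. A minimal standalone sketch, not repo code, of what the "small" variant expands to, assuming the default of 1000 regular iterations:

# Sketch only: mirrors TorchBenchmark._bin_args for the "small" variant.
params = {"batchSize": 512, "slmNum": 1, "warmupIterations": 1}
iterations = 1000  # _iterations_regular when not tracing
flags = [f"--iterations={iterations}"] + [f"--{k}={v}" for k, v in params.items()]
print(flags)
# ['--iterations=1000', '--batchSize=512', '--slmNum=1', '--warmupIterations=1']

The slmNum=-1 in the "max" variant presumably selects the device's maximum SLM size; that reading is inferred from the variant name, not stated in this diff.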
@@ -810,25 +840,31 @@ def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         return [f"--{k}={v}" for k, v in self._rr_params.items()]
 
 
-class TorchMultiQueue(ComputeBenchmark):
+class TorchBenchmark(ComputeBenchmark):
     def __init__(
-        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
+        self,
+        suite,
+        runtime: RUNTIMES,
+        bench_name: str,
+        variant_name: str,
+        profiler_type,
+        **kwargs,
     ):
         self._variant_name = variant_name
-        self._smq_params = kwargs
+        self._torch_params = kwargs
         self._iterations_regular = 1000
         self._iterations_trace = 10
         super().__init__(
             suite,
             f"torch_benchmark_{runtime.value}",
-            "KernelSubmitMultiQueue",
+            bench_name,
             runtime,
             profiler_type,
         )
 
     def name(self):
         ret = []
-        for k, v in self._smq_params.items():
+        for k, v in self._torch_params.items():
             ret.append(f"{k} {v}")
         ret.sort()
         return self._bench_name + " " + ", ".join(ret)
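
The sorted parameter list built by name() is exactly what the integration tests below match on. A quick sketch, not repo code, reproducing the display name of the L0 "small" SLM variant:

# Sketch only: mirrors TorchBenchmark.name() for TorchSlmSize "small" on L0.
bench_name = "torch_benchmark_l0"
params = {"batchSize": 512, "slmNum": 1, "warmupIterations": 1}
print(bench_name + " " + ", ".join(sorted(f"{k} {v}" for k, v in params.items())))
# torch_benchmark_l0 batchSize 512, slmNum 1, warmupIterations 1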
@@ -848,10 +884,38 @@ def _supported_runtimes(self) -> list[RUNTIMES]:
     def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self._get_iters(run_trace)
         return [f"--iterations={iters}"] + [
-            f"--{k}={v}" for k, v in self._smq_params.items()
+            f"--{k}={v}" for k, v in self._torch_params.items()
         ]
 
 
+class TorchMultiQueue(TorchBenchmark):
+    def __init__(
+        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
+    ):
+        super().__init__(
+            suite,
+            runtime,
+            "KernelSubmitMultiQueue",
+            variant_name,
+            profiler_type,
+            **kwargs,
+        )
+
+
+class TorchSlmSize(TorchBenchmark):
+    def __init__(
+        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
+    ):
+        super().__init__(
+            suite,
+            runtime,
+            "KernelSubmitSlmSize",
+            variant_name,
+            profiler_type,
+            **kwargs,
+        )
+
+
 class QueueInOrderMemcpy(ComputeBenchmark):
     def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
         self._is_copy_only = isCopyOnly
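
With the shared parameter handling hoisted into TorchBenchmark, a concrete benchmark reduces to pinning its compute-benchmarks test name. A hypothetical sketch of what a further subclass would look like (KernelSubmitFooBar is an invented name, not an existing test):

class TorchFooBar(TorchBenchmark):
    def __init__(self, suite, runtime, variant_name, profiler_type, **kwargs):
        # Only the test name differs; the rest is forwarded to the base class.
        super().__init__(
            suite, runtime, "KernelSubmitFooBar", variant_name, profiler_type, **kwargs
        )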

devops/scripts/benchmarks/tests/test_integration.py

Lines changed: 33 additions & 7 deletions
@@ -3,6 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+import argparse
 import json
 import os
 import shutil
@@ -15,6 +16,7 @@
 sys.path.append(f"{os.path.dirname(__file__)}/../")
 from utils.workdir_version import INTERNAL_WORKDIR_VERSION
 
+VERBOSE_LOGS = False
 
 DataJson = namedtuple("DataJson", ["runs", "metadata", "tags", "names"])
 DataJsonRun = namedtuple("DataJsonRun", ["name", "results"])
@@ -65,7 +67,7 @@ def run_main(self, *args):
 
         # TODO: not yet tested: "--detect-version", "sycl,compute_runtime"
 
-        procesResult = subprocess.run(
+        proc = subprocess.run(
             [
                 "./devops/scripts/benchmarks/main.py",
                 self.WORKDIR_DIR,
@@ -86,13 +88,14 @@ def run_main(self, *args):
                 "--stddev-threshold",
                 "999999999.9",
                 "--exit-on-failure",
+                "--verbose" if VERBOSE_LOGS else "--log-level=info",
                 *args,
             ],
             capture_output=True,
         )
-        print("MAIN_PY_STDOUT:\n" + procesResult.stdout.decode())
-        print("MAIN_PY_STDERR:\n" + procesResult.stderr.decode())
-        return procesResult.returncode
+        print("MAIN_PY_STDOUT:\n" + proc.stdout.decode() if proc.stdout else "<empty>")
+        print("MAIN_PY_STDERR:\n" + proc.stderr.decode() if proc.stderr else "<empty>")
+        return proc.returncode
 
     def get_output(self):
         with open(os.path.join(self.OUTPUT_DIR, "data.json")) as f:
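
One subtlety in the rewritten print lines: Python's conditional expression binds looser than +, so each groups as ("MAIN_PY_STDOUT:\n" + proc.stdout.decode()) if proc.stdout else "<empty>", meaning an empty stream prints a bare <empty> with no prefix label. A minimal standalone illustration, not repo code:

# Sketch only: operator precedence in the print expressions above.
out = b""  # an empty captured stream
print("MAIN_PY_STDOUT:\n" + out.decode() if out else "<empty>")
# prints just: <empty>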
@@ -136,9 +139,6 @@ def get_output(self):
         )
 
 
-# add "--verbose" for debug logs
-
-
 class TestE2E(unittest.TestCase):
     def setUp(self):
         # Load test data
@@ -194,20 +194,46 @@ def test_torch_l0(self):
             "KernelSubmitMultiQueue large",
             {"pytorch", "L0"},
         )
+        self._checkCase(
+            "torch_benchmark_l0 batchSize 512, slmNum 1, warmupIterations 1",
+            "KernelSubmitSlmSize small",
+            {"pytorch", "L0"},
+        )
 
     def test_torch_sycl(self):
         self._checkCase(
             "torch_benchmark_sycl kernelsPerQueue 10, workgroupCount 512, workgroupSize 256",
             "KernelSubmitMultiQueue medium",
             {"pytorch", "SYCL"},
         )
+        self._checkCase(
+            "torch_benchmark_sycl batchSize 512, slmNum -1, warmupIterations 1",
+            "KernelSubmitSlmSize max",
+            {"pytorch", "SYCL"},
+        )
 
     def test_torch_syclpreview(self):
         self._checkCase(
             "torch_benchmark_syclpreview kernelsPerQueue 4, workgroupCount 256, workgroupSize 124",
             "KernelSubmitMultiQueue small",
             {"pytorch", "SYCL"},
         )
+        self._checkCase(
+            "torch_benchmark_syclpreview batchSize 512, slmNum 1024, warmupIterations 1",
+            "KernelSubmitSlmSize medium",
+            {"pytorch", "SYCL"},
+        )
+
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SYCL's benchmark test framework")
+    parser.add_argument(
+        "--verbose",
+        help="Set benchmark framework's logging level to DEBUG.",
+        action="store_true",
+    )
+
+    args = parser.parse_args()
+    VERBOSE_LOGS = args.verbose
+
     unittest.main()
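
As an aside, unittest.main() parses sys.argv itself; here --verbose happens to be a flag unittest also understands, so the two parsers can coexist. A more general pattern for a custom flag unittest does not know is parse_known_args(), handing the leftover argv back to unittest. A hedged sketch, with a hypothetical flag name, not what this commit does:

import argparse
import sys
import unittest

parser = argparse.ArgumentParser()
parser.add_argument("--verbose-logs", action="store_true")  # hypothetical flag
args, remaining = parser.parse_known_args()
# Forward whatever we did not consume to unittest's own CLI parser.
unittest.main(argv=[sys.argv[0]] + remaining)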
