AmpereComputingAI · MarcelWilnicki · Sep 25, 2025 · Sep 25, 2025 · Sep 25, 2025 · Sep 26, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -84,6 +84,8 @@ jobs:
           wget https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/aio_objdet_dataset.tar.gz > /dev/null 2>&1
           tar -xf aio_objdet_dataset.tar.gz > /dev/null
 
+          apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg
+
           wget $S3_URL_RESNET_50_V15_TF_FP32 > /dev/null 2>&1
           IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
 
@@ -149,24 +151,24 @@ jobs:
           tar -xf aio_objdet_dataset.tar.gz > /dev/null
 
           wget $S3_URL_RESNET_50_V15_TF_FP32 > /dev/null 2>&1
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
 
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
 
           wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt > /dev/null 2>&1
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60
 
           python3 speech_recognition/whisper/run.py -m small.en
 
           wget $S3_URL_SSD_INCEPTION_V2_TF_FP32 > /dev/null 2>&1
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60
 
           wget https://zenodo.org/records/4735647/files/resnet50_v1.onnx > /dev/null 2>&1
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort
 
           wget https://s3.amazonaws.com/onnx-model-zoo/vgg/vgg16/vgg16.tar.gz > /dev/null 2>&1
           tar -xf vgg16.tar.gz > /dev/null
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
 
   test_pytorch_arm64_sh:
     if: false
@@ -257,21 +259,21 @@ jobs:
           tar -xf aio_objdet_dataset.tar.gz > /dev/null
 
           wget https://github.com/tloen/alpaca-lora/raw/main/alpaca_data.json > /dev/null 2>&1
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
 
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
 
-          IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
 
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en 
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en
 
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
 
           wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt > /dev/null 2>&1
-          IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch              
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch
 
           wget -O bert_large_mlperf.pt https://zenodo.org/records/3733896/files/model.pytorch?download=1 > /dev/null 2>&1
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
 
   test_tensorflow_arm64:
     runs-on: self-hosted

diff --git a/LICENSE b/LICENSE
@@ -187,7 +187,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright (c) 2024, Ampere Computing LLC
+   Copyright (c) 2025, Ampere Computing LLC
    Copyright (c) 2022 Andrej Karpathy
    Copyright (c) 2022 OpenAI
    Copyright (c) 2022 Stability AI

diff --git a/computer_vision/object_detection/yolo_v5/run.py b/computer_vision/object_detection/yolo_v5/run.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:

diff --git a/computer_vision/object_detection/yolo_v8/run.py b/computer_vision/object_detection/yolo_v8/run.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:
@@ -61,15 +61,15 @@ def run_ort_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_pa
     # Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
     # to set it to True if needed
     from utils.ort import OrtRunner
-    from ultralytics.yolo.utils import ops
+    from ultralytics.utils import nms
 
     def run_single_pass(ort_runner, coco):
         shape = (640, 640)
         ort_runner.set_input_tensor("images", coco.get_input_array(shape).astype("float32"))
         output = ort_runner.run(batch_size)
 
         output = torch.from_numpy(output[0])
-        output = ops.non_max_suppression(output)
+        output = nms.non_max_suppression(output)
 
         for i in range(batch_size):
             for d in range(output[i].shape[0]):
@@ -97,11 +97,11 @@ def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_
     # Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
     # to set it to True if needed
     from utils.pytorch import PyTorchRunner
-    from ultralytics.yolo.utils import ops
+    from ultralytics.utils import nms
 
     def run_single_pass(pytorch_runner, coco):
         output = pytorch_runner.run(batch_size, coco.get_input_array((640, 640)))
-        output = ops.non_max_suppression(output)
+        output = nms.non_max_suppression(output)
 
         for i in range(batch_size):
             for d in range(output[i].shape[0]):
@@ -121,7 +121,7 @@ def run_single_pass(pytorch_runner, coco):
 
     runner = PyTorchRunner(torch.jit.load(torchscript_model),
                            disable_jit_freeze=disable_jit_freeze,
-                           example_inputs=torch.stack(dataset.get_input_array((640, 640))))
+                           example_inputs=torch.stack((dataset.get_input_array((640, 640)),)))
 
     return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)
 

diff --git a/natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py b/natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:
@@ -43,6 +43,10 @@ def parse_args():
     parser.add_argument("--squad_path",
                         type=str,
                         help="path to directory with ImageNet validation images")
+    parser.add_argument("--fixed_input", action='store_true',
+                        help="pad/truncate every input to exactly --input_size tokens")
+    parser.add_argument("--input_size", type=int, default=384,
+                        help="sequence length in tokens, used only when --fixed_input is set")
     parser.add_argument("--disable_jit_freeze", action='store_true',
                         help="if true model will be run not in jit freeze mode")
     return parser.parse_args()
@@ -93,7 +97,8 @@ def run_tf_fp16(model_path, batch_size, num_runs, timeout, squad_path, **kwargs)
     return run_tf_fp(model_path, batch_size, num_runs, timeout, squad_path)
 
 
-def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze=False):
+def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path,
+                   input_size=384, disable_jit_freeze=False, fixed_input=False):
     from utils.benchmark import run_model
     from utils.nlp.squad import Squad_v1_1
     from transformers import AutoTokenizer, BertConfig, BertForQuestionAnswering
@@ -117,7 +122,11 @@ def run_single_pass(pytorch_runner, squad):
         padding=True, truncation=True, model_max_length=512)
 
     def tokenize(question, text):
-        return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
+        if fixed_input:
+            return tokenizer(question, text, padding="max_length", truncation=True,
+                             max_length=input_size, return_tensors="pt")
+        else:
+            return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
 
     def detokenize(answer):
         return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer))
@@ -199,8 +208,10 @@ def detokenize(answer):
     return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)
 
 
-def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze, **kwargs):
-    return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze)
+def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path,
+                     input_size=384, disable_jit_freeze=False, fixed_input=False, **kwargs):
+    return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path,
+                          input_size=input_size, disable_jit_freeze=disable_jit_freeze, fixed_input=fixed_input)
 
 
 def main():

diff --git a/recommendation/dlrm/run.py b/recommendation/dlrm/run.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:

diff --git a/requirements.txt b/requirements.txt
@@ -17,6 +17,6 @@ tiktoken
 ultralytics
 evaluate
-datasets
+datasets[audio]
 soundfile
 librosa
 numba
@@ -35,4 +35,4 @@ kornia
 open-clip-torch<2.26.1
 diffusers
 accelerate
-boto3==1.29.0; python_version>='3.12'
+boto3==1.29.0; python_version>='3.12'
diff --git a/tests/test_pytorch_models.py b/tests/test_pytorch_models.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 import os
 import signal
 import time
@@ -222,7 +222,8 @@ def wrapper(**kwargs):
 
         exact_match_ref, f1_ref = 0.750, 0.817
         acc = run_process(wrapper, {"model_path": self.model_path, "squad_path": self.dataset_path,
-                                    "batch_size": 1, "num_runs": 24, "timeout": None, "disable_jit_freeze": False})
+                                    "batch_size": 1, "num_runs": 24, "timeout": None,
+                                    "input_size": 384, "disable_jit_freeze": False, "fixed_input": False})
         self.assertTrue(acc["exact_match"] / exact_match_ref > 0.95)
         self.assertTrue(acc["f1"] / f1_ref > 0.95)
 
@@ -367,6 +368,8 @@ def setUp(self):
 
     def test_yolo_v8_s(self):
         from computer_vision.object_detection.yolo_v8.run import run_pytorch_fp32
+        from utils.benchmark import set_global_intra_op_parallelism_threads
+        set_global_intra_op_parallelism_threads(32)
 
         def wrapper(**kwargs):
             kwargs["q"].put(run_pytorch_fp32(**kwargs)[0])

diff --git a/utils/cv/pre_processing.py b/utils/cv/pre_processing.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 import numpy as np
 import utils.misc as utils