Skip to content
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
7b55ce5
first commit
MarcelWilnicki Sep 25, 2025
1ede310
wip
MarcelWilnicki Sep 25, 2025
a31e5e1
wip
MarcelWilnicki Sep 25, 2025
6d5003d
wip
MarcelWilnicki Sep 26, 2025
776e70d
wip
MarcelWilnicki Sep 26, 2025
b3c4a0c
wip
MarcelWilnicki Sep 26, 2025
f8c8b06
wip
MarcelWilnicki Sep 26, 2025
df07b65
wip
MarcelWilnicki Sep 26, 2025
6626f91
wip
MarcelWilnicki Sep 26, 2025
01cb4de
wip
MarcelWilnicki Sep 30, 2025
bc405d6
wip
MarcelWilnicki Sep 30, 2025
ddf25ed
wip
MarcelWilnicki Oct 1, 2025
c4e81b0
wip
MarcelWilnicki Oct 1, 2025
c7764d4
wip
MarcelWilnicki Oct 2, 2025
146e3b0
wip
MarcelWilnicki Oct 2, 2025
83d284e
wip
MarcelWilnicki Oct 2, 2025
93ed7b4
wip
MarcelWilnicki Oct 3, 2025
07a1a34
wip
MarcelWilnicki Oct 3, 2025
ccddf0a
wip
MarcelWilnicki Oct 7, 2025
2a518e7
wip
MarcelWilnicki Oct 7, 2025
fe213de
wip
MarcelWilnicki Oct 7, 2025
02d4de6
wip
MarcelWilnicki Oct 7, 2025
d720b06
wip
MarcelWilnicki Oct 7, 2025
c682f18
wip
MarcelWilnicki Oct 7, 2025
2b56ab0
wip
MarcelWilnicki Oct 8, 2025
23e4287
wip
MarcelWilnicki Oct 8, 2025
871de3a
wip
MarcelWilnicki Oct 16, 2025
5ff4486
wip
MarcelWilnicki Oct 17, 2025
ee2acbd
wip
MarcelWilnicki Oct 17, 2025
0b9196f
wip
MarcelWilnicki Oct 17, 2025
5e97ac4
wip
MarcelWilnicki Oct 17, 2025
3ae639f
wip
MarcelWilnicki Oct 17, 2025
27af2ee
wip
MarcelWilnicki Oct 17, 2025
84eaf81
wip
MarcelWilnicki Oct 17, 2025
7181c43
wip
MarcelWilnicki Oct 17, 2025
47e0779
wip
MarcelWilnicki Oct 17, 2025
5ca5249
wip
MarcelWilnicki Oct 17, 2025
c2e309d
wip
MarcelWilnicki Oct 20, 2025
fba70f6
wip
MarcelWilnicki Oct 20, 2025
242ca72
wip
MarcelWilnicki Oct 20, 2025
e5f63ab
wip
MarcelWilnicki Oct 20, 2025
8628d9e
wip
MarcelWilnicki Oct 20, 2025
572eb52
wip
MarcelWilnicki Oct 20, 2025
58b26c7
wip
MarcelWilnicki Oct 22, 2025
6c47e2f
wip
MarcelWilnicki Oct 22, 2025
006ebb1
wip
MarcelWilnicki Oct 22, 2025
13e9fd7
wip
MarcelWilnicki Oct 22, 2025
706fe6e
wip
MarcelWilnicki Oct 22, 2025
ad8cf9f
wip
MarcelWilnicki Oct 22, 2025
8983fae
wip
MarcelWilnicki Oct 22, 2025
b22bd26
wip
MarcelWilnicki Oct 23, 2025
51f3b94
wip
MarcelWilnicki Oct 23, 2025
3f70599
wip
MarcelWilnicki Oct 24, 2025
4d8ec72
wip
MarcelWilnicki Oct 30, 2025
cfb802a
wip
MarcelWilnicki Oct 30, 2025
0face5a
wip
MarcelWilnicki Oct 31, 2025
14f0ab3
wip
MarcelWilnicki Oct 31, 2025
8351e91
wip
MarcelWilnicki Oct 31, 2025
0f8b11a
wip
MarcelWilnicki Oct 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ jobs:
wget https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/aio_objdet_dataset.tar.gz > /dev/null 2>&1
tar -xf aio_objdet_dataset.tar.gz > /dev/null

apt-get update && apt-get install -y ffmpeg
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be handled by setup_deb.sh. One of the goals of this test is to make sure all deps are installed by it, so if something is missing it should be added there.


wget $S3_URL_RESNET_50_V15_TF_FP32 > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60

Expand Down Expand Up @@ -149,24 +151,24 @@ jobs:
tar -xf aio_objdet_dataset.tar.gz > /dev/null

wget $S3_URL_RESNET_50_V15_TF_FP32 > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the envs defined starting at line 115 for this. There is no need to specify them for each process separately.


IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60

python3 speech_recognition/whisper/run.py -m small.en

wget $S3_URL_SSD_INCEPTION_V2_TF_FP32 > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60

wget https://zenodo.org/records/4735647/files/resnet50_v1.onnx > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort

wget https://s3.amazonaws.com/onnx-model-zoo/vgg/vgg16/vgg16.tar.gz > /dev/null 2>&1
tar -xf vgg16.tar.gz > /dev/null
IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort

test_pytorch_arm64_sh:
if: false
Expand Down Expand Up @@ -257,21 +259,21 @@ jobs:
tar -xf aio_objdet_dataset.tar.gz > /dev/null

wget https://github.com/tloen/alpaca-lora/raw/main/alpaca_data.json > /dev/null 2>&1
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json

AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32

IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch

AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en

IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch

wget -O bert_large_mlperf.pt https://zenodo.org/records/3733896/files/model.pytorch?download=1 > /dev/null 2>&1
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch

test_tensorflow_arm64:
runs-on: self-hosted
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright (c) 2024, Ampere Computing LLC
Copyright (c) 2025, Ampere Computing LLC
Copyright (c) 2022 Andrej Karpathy
Copyright (c) 2022 OpenAI
Copyright (c) 2022 Stability AI
Expand Down
2 changes: 1 addition & 1 deletion computer_vision/object_detection/yolo_v5/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down
12 changes: 6 additions & 6 deletions computer_vision/object_detection/yolo_v8/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down Expand Up @@ -61,15 +61,15 @@ def run_ort_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_pa
# Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
# to set it to True if needed
from utils.ort import OrtRunner
from ultralytics.yolo.utils import ops
from ultralytics.utils import nms

def run_single_pass(ort_runner, coco):
shape = (640, 640)
ort_runner.set_input_tensor("images", coco.get_input_array(shape).astype("float32"))
output = ort_runner.run(batch_size)

output = torch.from_numpy(output[0])
output = ops.non_max_suppression(output)
output = nms.non_max_suppression(output)

for i in range(batch_size):
for d in range(output[i].shape[0]):
Expand Down Expand Up @@ -97,11 +97,11 @@ def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_
# Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
# to set it to True if needed
from utils.pytorch import PyTorchRunner
from ultralytics.yolo.utils import ops
from ultralytics.utils import nms

def run_single_pass(pytorch_runner, coco):
output = pytorch_runner.run(batch_size, coco.get_input_array((640, 640)))
output = ops.non_max_suppression(output)
output = nms.non_max_suppression(output)

for i in range(batch_size):
for d in range(output[i].shape[0]):
Expand All @@ -121,7 +121,7 @@ def run_single_pass(pytorch_runner, coco):

runner = PyTorchRunner(torch.jit.load(torchscript_model),
disable_jit_freeze=disable_jit_freeze,
example_inputs=torch.stack(dataset.get_input_array((640, 640))))
example_inputs=torch.stack((dataset.get_input_array((640, 640)),)))

return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down Expand Up @@ -43,6 +43,10 @@ def parse_args():
parser.add_argument("--squad_path",
type=str,
help="path to directory with ImageNet validation images")
parser.add_argument("--fixed_input", action='store_true',
help="truncate input to fixed shape")
parser.add_argument("--input_size", type=int, default=384,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

input_size only takes effect when fixed_input is True. Therefore I suggest making a single arg out of this that both enables fixed_input and specifies the size, say --fixed_input_size.

help='size of the input')
parser.add_argument("--disable_jit_freeze", action='store_true',
help="if true model will be run not in jit freeze mode")
return parser.parse_args()
Expand Down Expand Up @@ -93,7 +97,8 @@ def run_tf_fp16(model_path, batch_size, num_runs, timeout, squad_path, **kwargs)
return run_tf_fp(model_path, batch_size, num_runs, timeout, squad_path)


def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze=False):
def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path,
input_size, disable_jit_freeze=False, fixed_input=False):
from utils.benchmark import run_model
from utils.nlp.squad import Squad_v1_1
from transformers import AutoTokenizer, BertConfig, BertForQuestionAnswering
Expand All @@ -117,7 +122,11 @@ def run_single_pass(pytorch_runner, squad):
padding=True, truncation=True, model_max_length=512)

def tokenize(question, text):
return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
if fixed_input:
return tokenizer(question, text, padding="max_length", truncation=True,
max_length=input_size, return_tensors="pt")
else:
return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")

def detokenize(answer):
return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer))
Expand Down Expand Up @@ -199,8 +208,10 @@ def detokenize(answer):
return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)


def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze, **kwargs):
return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze)
def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path,
input_size, disable_jit_freeze, fixed_input, **kwargs):
return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path,
input_size, disable_jit_freeze, fixed_input)


def main():
Expand Down
2 changes: 1 addition & 1 deletion recommendation/dlrm/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ tiktoken
ultralytics
evaluate
datasets
datasets[audio]
soundfile
librosa
numba
Expand All @@ -35,4 +36,4 @@ kornia
open-clip-torch<2.26.1
diffusers
accelerate
boto3==1.29.0; python_version>='3.12'
boto3==1.29.0; python_version>='3.12'
7 changes: 5 additions & 2 deletions tests/test_pytorch_models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
import os
import signal
import time
Expand Down Expand Up @@ -222,7 +222,8 @@ def wrapper(**kwargs):

exact_match_ref, f1_ref = 0.750, 0.817
acc = run_process(wrapper, {"model_path": self.model_path, "squad_path": self.dataset_path,
"batch_size": 1, "num_runs": 24, "timeout": None, "disable_jit_freeze": False})
"batch_size": 1, "num_runs": 24, "timeout": None,
"input_size": 384, "disable_jit_freeze": False, "fixed_input": False})
self.assertTrue(acc["exact_match"] / exact_match_ref > 0.95)
self.assertTrue(acc["f1"] / f1_ref > 0.95)

Expand Down Expand Up @@ -367,6 +368,8 @@ def setUp(self):

def test_yolo_v8_s(self):
from computer_vision.object_detection.yolo_v8.run import run_pytorch_fp32
from utils.benchmark import set_global_intra_op_parallelism_threads
set_global_intra_op_parallelism_threads(32)

def wrapper(**kwargs):
kwargs["q"].put(run_pytorch_fp32(**kwargs)[0])
Expand Down
2 changes: 1 addition & 1 deletion utils/cv/pre_processing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
import numpy as np
import utils.misc as utils

Expand Down