Skip to content

Commit 0325dff

Browse files
committed
Add DGX A100 baseline
A simple baseline should be created to verify performance for DGX A100 PODs of various scale. Signed-Off-By: Robert Clark <roclark@nvidia.com>
1 parent b01d22b commit 0325dff

File tree

2 files changed

+225
-18
lines changed

2 files changed

+225
-18
lines changed

bobber/lib/analysis/compare_baseline.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from bobber.lib.exit_codes import BASELINE_FAILURE
55
from bobber.lib.analysis.common import bcolors
66
from bobber.lib.system.file_handler import read_yaml
7-
from typing import Optional, Tuple
7+
from typing import NoReturn, Optional, Tuple
88

99

1010
# Map the dictionary keys in the baseline to human-readable names.
@@ -111,6 +111,8 @@ def evaluate_fio(baselines: dict, results: dict, test_name: str, failures: int,
111111
threshold.
112112
"""
113113
for test, value in baselines.items():
114+
if test_name not in results.keys():
115+
continue
114116
if test_name == 'bandwidth':
115117
unit = '(GB/s)'
116118
expected = value / 1000000000
@@ -155,6 +157,8 @@ def evaluate_nccl(baseline: dict, results: dict, failures: int,
155157
Returns an ``integer`` of the number of results that have not met the
156158
threshold.
157159
"""
160+
if 'max_bus_bw' not in baseline.keys():
161+
return failures
158162
print(' NCCL Max Bus Bandwidth (GB/s)')
159163
expected = baseline['max_bus_bw']
160164
got = results['nccl']['max_bus_bw']
@@ -196,6 +200,8 @@ def evaluate_dali(baselines: dict, results: dict, test_name: str,
196200
threshold.
197201
"""
198202
for test, value in baselines.items():
203+
if test not in results.keys():
204+
continue
199205
print(f' DALI {test} (images/second)')
200206
expected = value
201207
got = round(results[test]['average images/second'], 3)
@@ -208,7 +214,7 @@ def evaluate_dali(baselines: dict, results: dict, test_name: str,
208214

209215

210216
def evaluate_test(baseline: dict, results: dict, system_count: int,
211-
tolerance: int):
217+
tolerance: int, failures: int) -> int:
212218
"""
213219
Evaluate all tests for N-nodes and compare against the baseline.
214220
@@ -228,9 +234,16 @@ def evaluate_test(baseline: dict, results: dict, system_count: int,
228234
tolerance : int
229235
An ``int`` of the percentage below the threshold to still mark as
230236
passing.
231-
"""
232-
failures = 0
237+
failures : int
238+
An ``integer`` of the number of results that have not met the
239+
threshold.
233240
241+
Returns
242+
-------
243+
int
244+
Returns an ``integer`` of the number of results that have not met the
245+
threshold.
246+
"""
234247
for test_name, test_values in baseline.items():
235248
print('-' * 80)
236249
if test_name in ['bandwidth', 'iops']:
@@ -244,18 +257,11 @@ def evaluate_test(baseline: dict, results: dict, system_count: int,
244257
test_name,
245258
failures,
246259
tolerance)
247-
248-
if failures > 0:
249-
print('-' * 80)
250-
print(f'{failures} tests did not meet the suggested criteria!')
251-
print('See results above for failed tests and verify setup.')
252-
# Throw a non-zero exit status so any tools that read codes will catch
253-
# that the baseline was not met.
254-
sys.exit(BASELINE_FAILURE)
260+
return failures
255261

256262

257263
def compare_baseline(results: dict, baseline: str, tolerance: int,
258-
custom: Optional[bool] = False):
264+
custom: Optional[bool] = False) -> NoReturn:
259265
"""
260266
Compare a baseline against parsed results.
261267
@@ -281,6 +287,8 @@ def compare_baseline(results: dict, baseline: str, tolerance: int,
281287
passed from a YAML file. If `False`, it will compare against an
282288
included baseline.
283289
"""
290+
failures = 0
291+
284292
print('=' * 80)
285293
print('Baseline assessment')
286294
if custom:
@@ -299,9 +307,18 @@ def compare_baseline(results: dict, baseline: str, tolerance: int,
299307
print('Skipping...')
300308
continue
301309
print(f' {system_count} System(s)')
302-
evaluate_test(baseline_results,
303-
results['systems'][str(system_count)],
304-
system_count,
305-
tolerance)
310+
failures = evaluate_test(baseline_results,
311+
results['systems'][str(system_count)],
312+
system_count,
313+
tolerance,
314+
failures)
315+
316+
if failures > 0:
317+
print('-' * 80)
318+
print(f'{failures} test(s) did not meet the suggested criteria!')
319+
print('See results above for failed tests and verify setup.')
320+
# Throw a non-zero exit status so any tools that read codes will catch
321+
# that the baseline was not met.
322+
sys.exit(BASELINE_FAILURE)
306323

307324
print('=' * 80)

bobber/lib/constants.py

Lines changed: 191 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,196 @@
8383
}
8484
}
8585

86+
DGX_A100_POD_BASELINE = {
87+
'systems': {
88+
'1': {
89+
'bandwidth': {
90+
# FIO BW speed in bytes/second
91+
'read': 2250000000,
92+
'write': 875000000
93+
},
94+
'iops': {
95+
# FIO IOPS speed in ops/second
96+
'read': 87500,
97+
'write': 16250
98+
},
99+
'nccl': {
100+
# NCCL maximum bus bandwidth in GB/s
101+
'max_bus_bw': 230
102+
},
103+
'dali': {
104+
# DALI average speed in images/second
105+
'800x600 standard jpg': 2000,
106+
'3840x2160 standard jpg': 1000,
107+
'800x600 tfrecord': 4000,
108+
'3840x2160 tfrecord': 1000
109+
}
110+
},
111+
'2': {
112+
'bandwidth': {
113+
# FIO BW speed in bytes/second
114+
'read': 4500000000,
115+
'write': 1750000000
116+
},
117+
'iops': {
118+
# FIO IOPS speed in ops/second
119+
'read': 175000,
120+
'write': 32500
121+
},
122+
'nccl': {
123+
# NCCL maximum bus bandwidth in GB/s
124+
'max_bus_bw': 180
125+
},
126+
'dali': {
127+
# DALI average speed in images/second
128+
'800x600 standard jpg': 4000,
129+
'3840x2160 standard jpg': 2000,
130+
'800x600 tfrecord': 8000,
131+
'3840x2160 tfrecord': 2000
132+
}
133+
},
134+
'3': {
135+
'bandwidth': {
136+
# FIO BW speed in bytes/second
137+
'read': 6750000000,
138+
'write': 2625000000
139+
},
140+
'iops': {
141+
# FIO IOPS speed in ops/second
142+
'read': 262500,
143+
'write': 48750
144+
},
145+
'nccl': {
146+
# NCCL maximum bus bandwidth in GB/s
147+
'max_bus_bw': 180
148+
},
149+
'dali': {
150+
# DALI average speed in images/second
151+
'800x600 standard jpg': 6000,
152+
'3840x2160 standard jpg': 3000,
153+
'800x600 tfrecord': 12000,
154+
'3840x2160 tfrecord': 3000
155+
}
156+
},
157+
'4': {
158+
'bandwidth': {
159+
# FIO BW speed in bytes/second
160+
'read': 9000000000,
161+
'write': 3500000000
162+
},
163+
'iops': {
164+
# FIO IOPS speed in ops/second
165+
'read': 350000,
166+
'write': 65000
167+
},
168+
'nccl': {
169+
# NCCL maximum bus bandwidth in GB/s
170+
'max_bus_bw': 180
171+
},
172+
'dali': {
173+
# DALI average speed in images/second
174+
'800x600 standard jpg': 8000,
175+
'3840x2160 standard jpg': 4000,
176+
'800x600 tfrecord': 16000,
177+
'3840x2160 tfrecord': 4000
178+
}
179+
},
180+
'5': {
181+
'bandwidth': {
182+
# FIO BW speed in bytes/second
183+
'read': 11250000000,
184+
'write': 4375000000
185+
},
186+
'iops': {
187+
# FIO IOPS speed in ops/second
188+
'read': 437500,
189+
'write': 81250
190+
},
191+
'nccl': {
192+
# NCCL maximum bus bandwidth in GB/s
193+
'max_bus_bw': 180
194+
},
195+
'dali': {
196+
# DALI average speed in images/second
197+
'800x600 standard jpg': 20000,
198+
'3840x2160 standard jpg': 5000,
199+
'800x600 tfrecord': 20000,
200+
'3840x2160 tfrecord': 5000
201+
}
202+
},
203+
'6': {
204+
'bandwidth': {
205+
# FIO BW speed in bytes/second
206+
'read': 13500000000,
207+
'write': 5250000000
208+
},
209+
'iops': {
210+
# FIO IOPS speed in ops/second
211+
'read': 525000,
212+
'write': 97500
213+
},
214+
'nccl': {
215+
# NCCL maximum bus bandwidth in GB/s
216+
'max_bus_bw': 180
217+
},
218+
'dali': {
219+
# DALI average speed in images/second
220+
'800x600 standard jpg': 24000,
221+
'3840x2160 standard jpg': 6000,
222+
'800x600 tfrecord': 24000,
223+
'3840x2160 tfrecord': 6000
224+
}
225+
},
226+
'7': {
227+
'bandwidth': {
228+
# FIO BW speed in bytes/second
229+
'read': 15750000000,
230+
'write': 6125000000
231+
},
232+
'iops': {
233+
# FIO IOPS speed in ops/second
234+
'read': 612500,
235+
'write': 113750
236+
},
237+
'nccl': {
238+
# NCCL maximum bus bandwidth in GB/s
239+
'max_bus_bw': 180
240+
},
241+
'dali': {
242+
# DALI average speed in images/second
243+
'800x600 standard jpg': 28000,
244+
'3840x2160 standard jpg': 7000,
245+
'800x600 tfrecord': 28000,
246+
'3840x2160 tfrecord': 7000
247+
}
248+
},
249+
'8': {
250+
'bandwidth': {
251+
# FIO BW speed in bytes/second
252+
'read': 18000000000,
253+
'write': 7000000000
254+
},
255+
'iops': {
256+
# FIO IOPS speed in ops/second
257+
'read': 700000,
258+
'write': 130000
259+
},
260+
'nccl': {
261+
# NCCL maximum bus bandwidth in GB/s
262+
'max_bus_bw': 180
263+
},
264+
'dali': {
265+
# DALI average speed in images/second
266+
'800x600 standard jpg': 32000,
267+
'3840x2160 standard jpg': 8000,
268+
'800x600 tfrecord': 32000,
269+
'3840x2160 tfrecord': 8000
270+
}
271+
}
272+
}
273+
}
274+
86275
BASELINES = {
87-
'single-dgx-station-baseline': SINGLE_DGX_STATION_BASELINE
276+
'single-dgx-station-baseline': SINGLE_DGX_STATION_BASELINE,
277+
'dgx-a100-pod-baseline': DGX_A100_POD_BASELINE
88278
}

0 commit comments

Comments
 (0)