Add DGX A100 baseline

roclark · roclark · commit 0325dffe5318 · 2021-03-12T13:50:05.000-06:00
A simple baseline should be created to verify performance for DGX A100
PODs of various scales.

Signed-Off-By: Robert Clark <roclark@nvidia.com>
diff --git a/bobber/lib/analysis/compare_baseline.py b/bobber/lib/analysis/compare_baseline.py
@@ -4,7 +4,7 @@
 from bobber.lib.exit_codes import BASELINE_FAILURE
 from bobber.lib.analysis.common import bcolors
 from bobber.lib.system.file_handler import read_yaml
-from typing import Optional, Tuple
+from typing import NoReturn, Optional, Tuple
 
 
 # Map the dicitonary keys in the baseline to human-readable names.
@@ -111,6 +111,8 @@ def evaluate_fio(baselines: dict, results: dict, test_name: str, failures: int,
         threshold.
     """
     for test, value in baselines.items():
+        if test_name not in results.keys():
+            continue
         if test_name == 'bandwidth':
             unit = '(GB/s)'
             expected = value / 1000000000
@@ -155,6 +157,8 @@ def evaluate_nccl(baseline: dict, results: dict, failures: int,
         Returns an ``integer`` of the number of results that have not met the
         threshold.
     """
+    if 'max_bus_bw' not in baseline.keys():
+        return failures
     print('  NCCL Max Bus Bandwidth (GB/s)')
     expected = baseline['max_bus_bw']
     got = results['nccl']['max_bus_bw']
@@ -196,6 +200,8 @@ def evaluate_dali(baselines: dict, results: dict, test_name: str,
         threshold.
     """
     for test, value in baselines.items():
+        if test not in results.keys():
+            continue
         print(f'  DALI {test} (images/second)')
         expected = value
         got = round(results[test]['average images/second'], 3)
@@ -208,7 +214,7 @@ def evaluate_dali(baselines: dict, results: dict, test_name: str,
 
 
 def evaluate_test(baseline: dict, results: dict, system_count: int,
-                  tolerance: int):
+                  tolerance: int, failures: int) -> int:
     """
     Evaluate all tests for N-nodes and compare against the baseline.
 
@@ -228,9 +234,16 @@ def evaluate_test(baseline: dict, results: dict, system_count: int,
     tolerance : int
         An ``int`` of the percentage below the threshold to still mark as
         passing.
-    """
-    failures = 0
+    failures : int
+        An ``integer`` of the number of results that have not met the
+        threshold.
 
+    Returns
+    -------
+    int
+        Returns an ``integer`` of the number of results that have not met the
+        threshold.
+    """
     for test_name, test_values in baseline.items():
         print('-' * 80)
         if test_name in ['bandwidth', 'iops']:
@@ -244,18 +257,11 @@ def evaluate_test(baseline: dict, results: dict, system_count: int,
                                      test_name,
                                      failures,
                                      tolerance)
-
-    if failures > 0:
-        print('-' * 80)
-        print(f'{failures} tests did not meet the suggested criteria!')
-        print('See results above for failed tests and verify setup.')
-        # Throw a non-zero exit status so any tools that read codes will catch
-        # that the baseline was not met.
-        sys.exit(BASELINE_FAILURE)
+    return failures
 
 
 def compare_baseline(results: dict, baseline: str, tolerance: int,
-                     custom: Optional[bool] = False):
+                     custom: Optional[bool] = False) -> NoReturn:
     """
     Compare a baseline against parsed results.
 
@@ -281,6 +287,8 @@ def compare_baseline(results: dict, baseline: str, tolerance: int,
         passed from a YAML file. If `False`, it will compare against an
         included baseline.
     """
+    failures = 0
+
     print('=' * 80)
     print('Baseline assessment')
     if custom:
@@ -299,9 +307,18 @@ def compare_baseline(results: dict, baseline: str, tolerance: int,
             print('Skipping...')
             continue
         print(f' {system_count} System(s)')
-        evaluate_test(baseline_results,
-                      results['systems'][str(system_count)],
-                      system_count,
-                      tolerance)
+        failures = evaluate_test(baseline_results,
+                                 results['systems'][str(system_count)],
+                                 system_count,
+                                 tolerance,
+                                 failures)
+
+    if failures > 0:
+        print('-' * 80)
+        print(f'{failures} test(s) did not meet the suggested criteria!')
+        print('See results above for failed tests and verify setup.')
+        # Throw a non-zero exit status so any tools that read codes will catch
+        # that the baseline was not met.
+        sys.exit(BASELINE_FAILURE)
 
     print('=' * 80)
diff --git a/bobber/lib/constants.py b/bobber/lib/constants.py
@@ -83,6 +83,196 @@
     }
 }
 
+DGX_A100_POD_BASELINE = {
+    'systems': {
+        '1': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 2250000000,
+                'write': 875000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 87500,
+                'write': 16250
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 230
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 2000,
+                '3840x2160 standard jpg': 1000,
+                '800x600 tfrecord': 4000,
+                '3840x2160 tfrecord': 1000
+            }
+        },
+        '2': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 4500000000,
+                'write': 1750000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 175000,
+                'write': 32500
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 4000,
+                '3840x2160 standard jpg': 2000,
+                '800x600 tfrecord': 8000,
+                '3840x2160 tfrecord': 2000
+            }
+        },
+        '3': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 6750000000,
+                'write': 2625000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 262500,
+                'write': 48750
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 6000,
+                '3840x2160 standard jpg': 3000,
+                '800x600 tfrecord': 12000,
+                '3840x2160 tfrecord': 3000
+            }
+        },
+        '4': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 9000000000,
+                'write': 3500000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 350000,
+                'write': 65000
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 8000,
+                '3840x2160 standard jpg': 4000,
+                '800x600 tfrecord': 16000,
+                '3840x2160 tfrecord': 4000
+            }
+        },
+        '5': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 11250000000,
+                'write': 4375000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 437500,
+                'write': 81250
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 20000,
+                '3840x2160 standard jpg': 5000,
+                '800x600 tfrecord': 20000,
+                '3840x2160 tfrecord': 5000
+            }
+        },
+        '6': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 13500000000,
+                'write': 5250000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 525000,
+                'write': 97500
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 24000,
+                '3840x2160 standard jpg': 6000,
+                '800x600 tfrecord': 24000,
+                '3840x2160 tfrecord': 6000
+            }
+        },
+        '7': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 15750000000,
+                'write': 6125000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 612500,
+                'write': 113750
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 28000,
+                '3840x2160 standard jpg': 7000,
+                '800x600 tfrecord': 28000,
+                '3840x2160 tfrecord': 7000
+            }
+        },
+        '8': {
+            'bandwidth': {
+                # FIO BW speed in bytes/second
+                'read': 18000000000,
+                'write': 7000000000
+            },
+            'iops': {
+                # FIO IOPS speed in ops/second
+                'read': 700000,
+                'write': 130000
+            },
+            'nccl': {
+                # NCCL maximum bus bandwidth in GB/s
+                'max_bus_bw': 180
+            },
+            'dali': {
+                # DALI average speed in images/second
+                '800x600 standard jpg': 32000,
+                '3840x2160 standard jpg': 8000,
+                '800x600 tfrecord': 32000,
+                '3840x2160 tfrecord': 8000
+            }
+        }
+    }
+}
+
 BASELINES = {
-    'single-dgx-station-baseline': SINGLE_DGX_STATION_BASELINE
+    'single-dgx-station-baseline': SINGLE_DGX_STATION_BASELINE,
+    'dgx-a100-pod-baseline': DGX_A100_POD_BASELINE
 }