From 486c1022a4de21010d204380263fe7dba510afd7 Mon Sep 17 00:00:00 2001 From: Pranav Chanduri Date: Mon, 15 Jun 2026 20:10:08 +0530 Subject: [PATCH 1/2] feat: add Trackio as a new experiment monitoring backend (#7964) - Add TrackioMonitor class in deepspeed/monitor/trackio.py - Add TrackioConfig with enabled and project fields in config.py - Add check_trackio_availability() helper in utils.py - Register TrackioMonitor in MonitorMaster in monitor.py - Trackio is a lightweight offline-first logging library with WandB-compatible API, logs can be visualized on HF Hub Signed-off-by: Pranav Chanduri --- deepspeed/monitor/config.py | 23 ++++++++++++++++++----- deepspeed/monitor/monitor.py | 8 +++++++- deepspeed/monitor/trackio.py | 33 +++++++++++++++++++++++++++++++++ deepspeed/monitor/utils.py | 12 ++++++++++-- 4 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 deepspeed/monitor/trackio.py diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 960ce1ba997a..c5c9869ec1dd 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -10,7 +10,7 @@ def get_monitor_config(param_dict): - monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor", "comet")} + monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor", "comet", "trackio")} return DeepSpeedMonitorConfig(**monitor_dict) @@ -23,7 +23,7 @@ class TensorBoardConfig(DeepSpeedConfigModel): output_path: str = "" """ Path to where the Tensorboard logs will be written. If not provided, the - output path is set under the training script’s launching path. + output path is set under the training script's launching path. """ job_name: str = "DeepSpeedJobName" @@ -55,7 +55,7 @@ class CSVConfig(DeepSpeedConfigModel): output_path: str = "" """ Path to where the csv files will be written. If not provided, the output - path is set under the training script’s launching path. + path is set under the training script's launching path. """ job_name: str = "DeepSpeedJobName" @@ -122,6 +122,16 @@ class CometConfig(DeepSpeedConfigModel): """ +class TrackioConfig(DeepSpeedConfigModel): + """Sets parameters for Trackio monitor.""" + + enabled: bool = False + """ Whether logging to Trackio is enabled. Requires `trackio` package is installed. """ + + project: str = "deepspeed" + """ Name for the Trackio project. """ + + class DeepSpeedMonitorConfig(DeepSpeedConfigModel): """Sets parameters for various monitoring methods.""" @@ -137,8 +147,11 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel): csv_monitor: CSVConfig = {} """ Local CSV output of monitoring data. """ + trackio: TrackioConfig = {} + """ Trackio monitor, requires `trackio` package is installed. """ + @model_validator(mode="after") def check_enabled(self): - enabled = self.tensorboard.enabled or self.wandb.enabled or self.csv_monitor.enabled or self.comet.enabled + enabled = self.tensorboard.enabled or self.wandb.enabled or self.csv_monitor.enabled or self.comet.enabled or self.trackio.enabled self.__dict__["enabled"] = enabled - return self + return self \ No newline at end of file diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py index e7e26dc483d9..1e8fd17646ec 100644 --- a/deepspeed/monitor/monitor.py +++ b/deepspeed/monitor/monitor.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team + """ Support different forms of monitoring such as wandb and tensorboard """ @@ -25,6 +26,7 @@ def write_events(self, event_list): from .tensorboard import TensorBoardMonitor from .csv_monitor import csvMonitor from .comet import CometMonitor +from .trackio import TrackioMonitor class MonitorMaster(Monitor): @@ -35,8 +37,8 @@ def __init__(self, monitor_config): self.wandb_monitor = None self.csv_monitor = None self.comet_monitor = None + self.trackio_monitor = None self.enabled = monitor_config.enabled - if dist.get_rank() == 0: if monitor_config.tensorboard.enabled: self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard) @@ -46,6 +48,8 @@ def __init__(self, monitor_config): self.csv_monitor = csvMonitor(monitor_config.csv_monitor) if monitor_config.comet.enabled: self.comet_monitor = CometMonitor(monitor_config.comet) + if monitor_config.trackio.enabled: + self.trackio_monitor = TrackioMonitor(monitor_config.trackio) def write_events(self, event_list): if dist.get_rank() == 0: @@ -57,3 +61,5 @@ def write_events(self, event_list): self.csv_monitor.write_events(event_list) if self.comet_monitor is not None: self.comet_monitor.write_events(event_list) + if self.trackio_monitor is not None: + self.trackio_monitor.write_events(event_list) \ No newline at end of file diff --git a/deepspeed/monitor/trackio.py b/deepspeed/monitor/trackio.py new file mode 100644 index 000000000000..4efb192c18e9 --- /dev/null +++ b/deepspeed/monitor/trackio.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .utils import check_trackio_availability +from .monitor import Monitor +import deepspeed.comm as dist + + +class TrackioMonitor(Monitor): + + def __init__(self, trackio_config): + super().__init__(trackio_config) + check_trackio_availability() + import trackio + self.enabled = trackio_config.enabled + self.project = trackio_config.project + if self.enabled and dist.get_rank() == 0: + trackio.init(project=self.project) + + def log(self, data, step=None): + if self.enabled and dist.get_rank() == 0: + import trackio + return trackio.log(data, step=step) + + def write_events(self, event_list): + if self.enabled and dist.get_rank() == 0: + for event in event_list: + label = event[0] + value = event[1] + step = event[2] + self.log({label: value}, step=step) \ No newline at end of file diff --git a/deepspeed/monitor/utils.py b/deepspeed/monitor/utils.py index f5530e8532e1..b4736c9d54cb 100644 --- a/deepspeed/monitor/utils.py +++ b/deepspeed/monitor/utils.py @@ -8,8 +8,6 @@ def check_tb_availability(): try: - # torch.utils.tensorboard will fail if `tensorboard` is not available, - # see their docs for more details: https://pytorch.org/docs/1.8.0/tensorboard.html import tensorboard # noqa: F401 # type: ignore except ImportError: print('If you want to use tensorboard logging, please `pip install tensorboard`') @@ -35,3 +33,13 @@ def check_comet_availability(): except ImportError: print('If you want to use comet logging, please `pip install "comet_ml>=3.41.0"`') raise + + +def check_trackio_availability(): + try: + import trackio # noqa: F401 # type: ignore + except ImportError: + print( + 'If you want to use Trackio logging, please `pip install trackio` and follow the instructions at https://github.com/huggingface/trackio' + ) + raise \ No newline at end of file From 7f928bda20be907ff52badbbdde5b2b67be51be2 Mon Sep 17 00:00:00 2001 From: Pranav Chanduri Date: Mon, 22 Jun 2026 11:00:30 +0530 Subject: [PATCH 2/2] test: add unit tests and docs for Trackio monitor - Add TestTrackio covering config defaults, enabled config, and write_events() logging behavior (mocks the optional trackio dependency) - Add TestMonitorMasterTrackioWiring covering MonitorMaster registration when trackio is enabled/disabled - Update docs/_tutorials/monitor.md with Trackio overview, config example, and Custom Monitoring API reference Addresses Codex review feedback on PR #8065 Signed-off-by: Pranav Chanduri --- docs/_tutorials/monitor.md | 10 +++- tests/unit/monitor/test_monitor.py | 86 +++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/docs/_tutorials/monitor.md b/docs/_tutorials/monitor.md index 5e7a6fc4e834..0ca119f466d4 100644 --- a/docs/_tutorials/monitor.md +++ b/docs/_tutorials/monitor.md @@ -11,7 +11,7 @@ In this tutorial, we introduce the DeepSpeed Monitor and provide examples of its ## Overview -Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial) and simple CSV files. +Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial), [Trackio](https://github.com/huggingface/trackio) and simple CSV files. Below is a live monitoring view for TensorBoard: @@ -25,6 +25,8 @@ Below is a live monitoring view for Comet: ![CometML Example Output](/assets/images/comet_monitor.png){: .align-center} +[Trackio](https://github.com/huggingface/trackio) is a lightweight, offline-first experiment tracking library from Hugging Face with a WandB-compatible API. Runs can be visualized as an HF Space or dataset on the HF Hub. + ## Usage The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics. @@ -54,6 +56,10 @@ When using DeepSpeed for model training, the Monitor can be configured in the De "project": "my_project", "experiment_name": "my_experiment" } + "trackio": { + "enabled": true, + "project": "my_project" + } "csv_monitor": { "enabled": true, "output_path": "output/ds_logs/", @@ -69,7 +75,7 @@ DeepSpeed will automatically log to all available and enabled monitoring backend In addition to automatic monitoring, users can log their own custom metrics in client scripts. Currently, there are two ways to initialize Monitor objects: 1. (Recommended) - Create a `MonitorMaster(ds_config.monitor_config)` object, which automatically initializes all monitor backends present in the DeepSpeed configuration -2. Create a specific `TensorBoardMonitor(ds_config.monitor_config)`, `WandbMonitor(ds_config.monitor_config)`, `csvMonitor(ds_config.monitor_config)` object which will only initialize a specific monitor backend present in the DeepSpeed configuration +2. Create a specific `TensorBoardMonitor(ds_config.monitor_config)`, `WandbMonitor(ds_config.monitor_config)`, `TrackioMonitor(ds_config.monitor_config)`, `csvMonitor(ds_config.monitor_config)` object which will only initialize a specific monitor backend present in the DeepSpeed configuration The steps to create a custom monitor are as follows: diff --git a/tests/unit/monitor/test_monitor.py b/tests/unit/monitor/test_monitor.py index d4b3cf43921d..ca085a712a41 100644 --- a/tests/unit/monitor/test_monitor.py +++ b/tests/unit/monitor/test_monitor.py @@ -3,14 +3,18 @@ # DeepSpeed Team +import sys + from deepspeed.monitor.tensorboard import TensorBoardMonitor from deepspeed.monitor.wandb import WandbMonitor from deepspeed.monitor.csv_monitor import csvMonitor from deepspeed.monitor.config import DeepSpeedMonitorConfig from deepspeed.monitor.comet import CometMonitor +from deepspeed.monitor.trackio import TrackioMonitor +from deepspeed.monitor.monitor import MonitorMaster from unit.common import DistributedTest -from unittest.mock import Mock, patch +from unittest.mock import Mock, MagicMock, patch from deepspeed.runtime.config import DeepSpeedConfig import deepspeed.comm as dist @@ -164,3 +168,83 @@ def test_empty_comet(self): assert comet_monitor.enabled == defaults.enabled assert comet_monitor.samples_log_interval == defaults.samples_log_interval mock_start.assert_not_called() + + +class TestTrackio(DistributedTest): + world_size = 2 + + def test_trackio(self): + # trackio is an optional dependency, so we stub the module rather + # than requiring it to be installed for CI. + mock_trackio = MagicMock() + + config_dict = {"train_batch_size": 2, "trackio": {"enabled": True, "project": "my_project"}} + ds_config = DeepSpeedConfig(config_dict) + + with patch.dict(sys.modules, {"trackio": mock_trackio}): + trackio_monitor = TrackioMonitor(ds_config.monitor_config.trackio) + + assert trackio_monitor.enabled == True + assert trackio_monitor.project == "my_project" + + # trackio.init should only be called on rank 0 + if dist.get_rank() == 0: + mock_trackio.init.assert_called_once_with(project="my_project") + else: + mock_trackio.init.assert_not_called() + + def test_empty_trackio(self): + mock_trackio = MagicMock() + + config_dict = {"train_batch_size": 2, "trackio": {}} + ds_config = DeepSpeedConfig(config_dict) + + with patch.dict(sys.modules, {"trackio": mock_trackio}): + trackio_monitor = TrackioMonitor(ds_config.monitor_config.trackio) + + defaults = DeepSpeedMonitorConfig().trackio + assert trackio_monitor.enabled == defaults.enabled + assert trackio_monitor.project == defaults.project + + def test_trackio_write_events(self): + # Verifies write_events() correctly converts 3-tuples into + # trackio.log() calls with the right step value. + mock_trackio = MagicMock() + + config_dict = {"train_batch_size": 2, "trackio": {"enabled": True, "project": "my_project"}} + ds_config = DeepSpeedConfig(config_dict) + + with patch.dict(sys.modules, {"trackio": mock_trackio}): + trackio_monitor = TrackioMonitor(ds_config.monitor_config.trackio) + events = [("Train/Loss", 0.5, 100)] + trackio_monitor.write_events(events) + + if dist.get_rank() == 0: + mock_trackio.log.assert_called_once_with({"Train/Loss": 0.5}, step=100) + else: + mock_trackio.log.assert_not_called() + + +class TestMonitorMasterTrackioWiring(DistributedTest): + world_size = 2 + + def test_trackio_enabled_creates_monitor(self): + mock_trackio = MagicMock() + + config_dict = {"train_batch_size": 2, "trackio": {"enabled": True, "project": "my_project"}} + ds_config = DeepSpeedConfig(config_dict) + + with patch.dict(sys.modules, {"trackio": mock_trackio}): + monitor_master = MonitorMaster(ds_config.monitor_config) + + if dist.get_rank() == 0: + assert monitor_master.trackio_monitor is not None + assert isinstance(monitor_master.trackio_monitor, TrackioMonitor) + else: + assert monitor_master.trackio_monitor is None + + def test_trackio_disabled_skips_monitor(self): + config_dict = {"train_batch_size": 2, "trackio": {"enabled": False}} + ds_config = DeepSpeedConfig(config_dict) + monitor_master = MonitorMaster(ds_config.monitor_config) + assert monitor_master.trackio_monitor is None