deepspeedai · delock · Jun 27, 2026 · Jun 25, 2026 · Jun 27, 2026 · chatgpt-codex-connector
@@ -35,6 +35,7 @@
 from deepspeed.runtime.constants import PIPE_REPLICATED
 from deepspeed.accelerator import get_accelerator
 from deepspeed.runtime.zero.muon.original_muon import muon_update
+from deepspeed.runtime.zero.muon.muon_optimizer import MuonWithAuxAdam
 from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT, LOSS_SCALER,
                                             SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE,
                                             BASE_OPTIMIZER_STATE_STEP, CLIP_GRAD, ZERO_STAGE, PARAM_SLICE_MAPPINGS)
@@ -218,6 +219,12 @@ def __init__(self,
 
         self.reduce_scatter = reduce_scatter
 
+        # Muon's Newton-Schulz orthogonalization needs the full all-reduced gradient on each
+        # rank; reduce_scatter delivers only this rank's partition slice and silently corrupts
+        # cross-partition parameters (#7807). ZeRO-3 already guards this (see stage3.py).
+        if isinstance(self.optimizer, MuonWithAuxAdam) and self.reduce_scatter:
+            raise ValueError("Muon and reduce scatter cannot be used together")
+
         self.overlap_comm = overlap_comm
 
         self.deepspeed_adam_offload = self.cpu_offload

@@ -177,3 +177,37 @@ def test_ns_method_stage3(self, ns_method):
             loss = engine(x, y)
             engine.backward(loss)
             engine.step()
+
+
+class TestMuonRejectsReduceScatter(DistributedTest):
+    """Muon needs the full all-reduced gradient matrix on each rank for its Newton-Schulz
+    orthogonalization. reduce_scatter only delivers each rank its own partition slice, which
+    silently corrupts cross-partition parameters in ZeRO-1/2 (#7807). Initialization must fail
+    loudly, consistent with the ZeRO-3 guard in stage3.py (added in #7919)."""
+
+    world_size = 1
+
+    @pytest.mark.parametrize('zero_stage', [1, 2])
+    def test_muon_reduce_scatter_raises(self, zero_stage):
+        config_dict = {
+            "train_batch_size": 4,
+            "optimizer": {
+                "type": "muon",
+                "params": {
+                    "lr": 0.01
+                }
+            },
+            "fp16": {
+                "enabled": True
+            },
+            "zero_optimization": {
+                "stage": zero_stage,
+                "reduce_scatter": True,
+            },
+        }
+        model = SimpleModel(hidden_dim=32, nlayers=2)
+        with pytest.raises(ValueError, match="Muon and reduce scatter cannot be used together"):
+            deepspeed.initialize(config=config_dict,
+                                 model=model,
+                                 model_parameters=model.parameters(),
+                                 dist_init_required=False)