Commit f84cca6

Merge branch 'master' into fix-6848-forbid-repeated-init

2 parents: 1b15bea + 46c6c9e

7 files changed: +22, -28 lines

build_win.bat

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ set DS_BUILD_AIO=0
 set DS_BUILD_CUTLASS_OPS=0
 set DS_BUILD_EVOFORMER_ATTN=0
 set DS_BUILD_FP_QUANTIZER=0
+set DS_BUILD_GDS=0
 set DS_BUILD_RAGGED_DEVICE_OPS=0
 set DS_BUILD_SPARSE_ATTN=0
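The new DS_BUILD_GDS=0 line keeps the GDS (GPUDirect Storage) op out of Windows prebuilds, consistent with the other DS_BUILD_* switches in this script. As a rough illustration of how such a flag is typically consumed on the Python side (the helper below is a hypothetical sketch, not DeepSpeed's actual setup code):

import os

# Hypothetical sketch: a DS_BUILD_* variable set to "0" disables prebuilding
# the corresponding op; an unset flag falls back to a default.
def op_enabled(build_var: str, default: str = "0") -> bool:
    return bool(int(os.environ.get(build_var, default)))

os.environ["DS_BUILD_GDS"] = "0"   # as set in build_win.bat
print(op_enabled("DS_BUILD_GDS"))  # False -> skip building the GDS op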

deepspeed/runtime/engine.py

Lines changed: 4 additions & 6 deletions
@@ -799,10 +799,8 @@ def zero_load_from_fp32_weights(self):
     def zero_elastic_checkpoint(self):
         return self._config.zero_config.elastic_checkpoint

-    def zero_has_nvme_offload(self):
-        if not hasattr(self.optimizer, "swap_optimizer"):
-            return False
-        return self.optimizer.swap_optimizer or self.optimizer.params_in_nvme_and_cpu
+    def zero_nvme_offload_optimizer(self):
+        return getattr(self.optimizer, "swap_optimizer", False)

     def zero_max_live_parameters(self):
         return self._config.zero_config.max_live_parameters

@@ -2865,7 +2863,7 @@ def load_checkpoint(self,
         if not success:
             self.optimizer._restore_from_bit16_weights()

-        if self.zero_has_nvme_offload():
+        if self.zero_nvme_offload_optimizer():
             from shutil import copytree, disk_usage
             offload_dir = self.optimizer.optimizer_swapper.swap_folder
             offload_ckpt_dir = os.path.join(load_dir, tag, "offloaded_tensors")

@@ -3205,7 +3203,7 @@ def save_checkpoint(self, save_dir, tag=None, client_state={}, save_latest=True,
         self._create_zero_checkpoint_files(save_dir, tag)
         self._save_zero_checkpoint(save_dir, tag)

-        if self.zero_has_nvme_offload():
+        if self.zero_nvme_offload_optimizer():
             from shutil import copytree, disk_usage
             offload_dir = self.optimizer.optimizer_swapper.swap_folder
             offload_ckpt_dir = os.path.join(save_dir, tag, "offloaded_tensors")
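The renamed helper narrows its meaning along with its name: the old zero_has_nvme_offload() also reported True when parameters lived in NVMe/CPU via params_in_nvme_and_cpu, whereas zero_nvme_offload_optimizer() only reports optimizer-state swapping, and getattr with a False default folds the hasattr guard into a single expression. A minimal sketch of that getattr pattern (stand-in classes, not DeepSpeed's optimizers):

class PlainOptimizer:          # no NVMe swapping support at all
    pass

class SwappingOptimizer:       # ZeRO-3 style optimizer that swaps state to NVMe
    swap_optimizer = True

# getattr(obj, name, False) returns False both when the attribute is missing
# and when it is set to False, replacing the explicit hasattr check.
for opt in (PlainOptimizer(), SwappingOptimizer()):
    print(getattr(opt, "swap_optimizer", False))
# False
# True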

deepspeed/runtime/swap_tensor/optimizer_utils.py

Lines changed: 5 additions & 0 deletions
@@ -153,6 +153,11 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume
             'timer_names',
         ]

+    def purge_state(self):
+        for swap_info in self.swap_params_info.values():
+            swap_info.tensors = [swap_info.tensors[0]]
+            swap_info.has_state_tensors = False
+
     def swappable_tensor(self, param=None, numel=None):
         assert param is not None or numel is not None, "Either param or numel must be provided"
         if param is not None:
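purge_state() centralizes the cleanup that the two call sites in stage3.py previously inlined: for every swapped parameter it keeps only the first tensor (the parameter itself) and clears has_state_tensors, so optimizer state initialized from the fresh model is dropped before checkpoint state is loaded. A toy model of the effect (SwapInfo here is an illustrative stand-in, not the real class):

# Illustrative stand-in for the swap-info records held in swap_params_info.
class SwapInfo:
    def __init__(self, tensors):
        self.tensors = tensors                      # [param, exp_avg, exp_avg_sq, ...]
        self.has_state_tensors = len(tensors) > 1

swap_params_info = {0: SwapInfo(["param", "exp_avg", "exp_avg_sq"])}

# What purge_state() does per entry: drop everything but the parameter tensor.
for swap_info in swap_params_info.values():
    swap_info.tensors = [swap_info.tensors[0]]
    swap_info.has_state_tensors = False

print(swap_params_info[0].tensors)  # ['param']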

deepspeed/runtime/zero/stage3.py

Lines changed: 6 additions & 19 deletions
@@ -546,15 +546,10 @@ def _setup_for_real_optimizer(self):
             self.grad_partitions_flat_buffer = get_accelerator().pin_memory(self.grad_partitions_flat_buffer)

         offset = 0
-        max_partition_numel = 0
         for param in all_params:
             self.__param_id_to_grad_partition[param.ds_id] = self.grad_partitions_flat_buffer.narrow(
                 0, offset, param.partition_numel())
             offset += param.partition_numel()
-            max_partition_numel = max(max_partition_numel, param.partition_numel())
-        if self.offload_optimizer:
-            self.pinned_grad_buffer: Tensor = get_accelerator().pin_memory(
-                torch.empty(max_partition_numel, device=self.device))

     def _link_all_hp_params(self):
         for p in self.module.parameters():

@@ -1510,13 +1505,9 @@ def partition_grads(self, params_to_release: List[Parameter], grad_partitions: L
                         offload_fp32_gradients[i].append(grad_buffer.float())
                         offload_fp32_offsets[i].append(dest_offset)
                     else:
-                        buffer_numel = grad_buffer.numel()
                         fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow(
-                            0, dest_offset, buffer_numel)
-                        self.pinned_grad_buffer[:buffer_numel].copy_(
-                            grad_buffer.to(dtype=torch.float32, non_blocking=True))
-                        get_accelerator().synchronize()
-                        fp32_grad_tensor.copy_(self.pinned_grad_buffer[:buffer_numel], non_blocking=True)
+                            0, dest_offset, grad_buffer.numel())
+                        fp32_grad_tensor.copy_(grad_buffer.float())

             # free the gradient
             if not get_accelerator().is_synchronized_device():

@@ -2661,11 +2652,9 @@ def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True):
         self.optimizer.load_state_dict(state_dict[OPTIMIZER_STATE_DICT])
         self._clear_fp32_optimizer_param_groups()

-        if self.swap_optimizer or self.params_in_nvme_and_cpu:
+        if self.swap_optimizer:
             # Purge the swapped optimizer state, it was initialized to the freshly created model and not the checkpoint
-            for swap_info in self.optimizer_swapper.swap_params_info.values():
-                swap_info.tensors = [swap_info.tensors[0]]
-                swap_info.has_state_tensors = False
+            self.optimizer_swapper.purge_state()

         if self.swap_optimizer:
             # Touch all parameters to synchronize all buffers

@@ -2782,11 +2771,9 @@ def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, pa
         else:
             optim_sd[OPTIMIZER_STATE_DICT]['state'][0][key] = key_tensor

-        if self.swap_optimizer or self.params_in_nvme_and_cpu:
+        if self.swap_optimizer:
             # Purge the swapped optimizer state, it was initialized to the freshly created model and not the checkpoint
-            for swap_info in self.optimizer_swapper.swap_params_info.values():
-                swap_info.tensors = [swap_info.tensors[0]]
-                swap_info.has_state_tensors = False
+            self.optimizer_swapper.purge_state()

         if self.swap_optimizer:
             # Touch all parameters to synchronize all buffers
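The partition_grads change removes the staging hop through self.pinned_grad_buffer and its explicit synchronize: the upcast gradient is now copied straight into the narrowed fp32 partition, which matches dropping the buffer's allocation in _setup_for_real_optimizer above. A self-contained sketch of the surviving copy path, with plain CPU tensors standing in for DeepSpeed's buffers:

import torch

# Stand-ins: in DeepSpeed, fp32_partition is the (pinned) fp32 optimizer
# partition and grad_buffer is a device-side low-precision gradient slice.
fp32_partition = torch.zeros(1024, dtype=torch.float32)
grad_buffer = torch.randn(256, dtype=torch.float16)
dest_offset = 128

# New code path: narrow the destination window, then let copy_() handle the
# dtype conversion and transfer in one step, with no intermediate buffer.
fp32_grad_tensor = fp32_partition.narrow(0, dest_offset, grad_buffer.numel())
fp32_grad_tensor.copy_(grad_buffer.float())

assert torch.equal(fp32_grad_tensor, grad_buffer.float())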

op_builder/builder.py

Lines changed: 2 additions & 1 deletion
@@ -76,7 +76,8 @@ def get_default_compute_capabilities():
 cuda_minor_mismatch_ok = {
     10: ["10.0", "10.1", "10.2"],
     11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
-    12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6"],
+    12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6",
+         "12.8"],  # There does not appear to be a CUDA Toolkit 12.7
 }
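Adding "12.8" (and deliberately skipping the nonexistent 12.7) extends the set of toolkit versions treated as compatible within the CUDA 12 major family. A hedged sketch of how such a table is typically consulted (the helper below is illustrative, not the exact builder.py check):

cuda_minor_mismatch_ok = {
    11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
    12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6",
         "12.8"],  # no CUDA Toolkit 12.7 was released
}

def minor_mismatch_ok(torch_cuda: str, sys_cuda: str) -> bool:
    # Tolerate a torch-vs-installed-toolkit minor mismatch only when both
    # versions belong to the same major family's allow-list.
    major = int(torch_cuda.split(".")[0])
    allowed = cuda_minor_mismatch_ok.get(major, [])
    return torch_cuda in allowed and sys_cuda in allowed

print(minor_mismatch_ok("12.4", "12.8"))  # True: both in the 12.x list
print(minor_mismatch_ok("11.8", "12.8"))  # False: majors differ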

tests/unit/runtime/zero/test_nvme_checkpointing.py

Lines changed: 3 additions & 1 deletion
@@ -22,8 +22,10 @@ class TestNVMeCheckpointing(DistributedTest):
     world_size = 1

     @pytest.mark.parametrize('param_offload_device, optim_offload_device',
-                             [(OffloadDeviceEnum.cpu, OffloadDeviceEnum.cpu),
+                             [(OffloadDeviceEnum.none, OffloadDeviceEnum.nvme),
                              (OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme),
+                             (OffloadDeviceEnum.nvme, OffloadDeviceEnum.none),
+                             (OffloadDeviceEnum.nvme, OffloadDeviceEnum.cpu),
                              (OffloadDeviceEnum.nvme, OffloadDeviceEnum.nvme)])
     def test_nvme_checkpointing(self, tmpdir, param_offload_device, optim_offload_device):
         zero_dir, ckpt_dir = os.path.join(tmpdir, "zero"), os.path.join(tmpdir, "checkpoint")

version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.16.3
+0.16.4
