intel · xin3he · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/auto_round/compressors/shard_writer.py b/auto_round/compressors/shard_writer.py
@@ -142,6 +142,11 @@ def _flush_shard(self):
         if self.use_safetensors:
             from safetensors.torch import save_file
 
+            # safetensors requires contiguous tensors. Replace non-contiguous entries in the existing
+            # dict rather than building a second dict, which would increase peak RAM during saving.
+            for k, v in list(self.current_shard_tensors.items()):
+                if isinstance(v, torch.Tensor) and not v.is_contiguous():
+                    self.current_shard_tensors[k] = v.contiguous()
             save_file(self.current_shard_tensors, tmp_path)
         else:
             torch.save(self.current_shard_tensors, tmp_path)

diff --git a/test/test_cuda/requirements_vllm.txt b/test/test_cuda/requirements_vllm.txt
@@ -1,2 +1,3 @@
 vllm
 lm_eval >= 0.4.10
+ray