
Commit 2d9a17a

Refactor bucketing, disable torch.compile by default
1 parent: a53d6e1

File tree

3 files changed · +12 -7 lines changed

src/instructlab/training/main_ds.py
src/instructlab/training/multipack_sampler.py
src/instructlab/training/token_dataset.py


src/instructlab/training/main_ds.py

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ def setup_model(
     else:
         model = AutoModelForCausalLM.from_pretrained(**base_model_args)
 
-    if is_torch_hpu_available():
+    if is_torch_hpu_available() and os.getenv("HPU_ENABLE_TORCH_COMPILE", False):
         torch._dynamo.config.cache_size_limit = int(1e4)
         torch._dynamo.config.accumulated_cache_size_limit = int(2e4)
         model = torch.compile(model, backend="hpu_backend", dynamic=False)
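
With this change, torch.compile on HPU becomes opt-in through the HPU_ENABLE_TORCH_COMPILE environment variable. One detail worth noting, which follows from standard os.getenv semantics rather than anything specific to this commit: the variable's value comes back as a raw string, so any non-empty setting, including "0", enables the compile path; only leaving the variable unset keeps it disabled. A minimal illustration:

    import os

    # Any non-empty string is truthy, so even HPU_ENABLE_TORCH_COMPILE=0
    # would enable compilation; only an unset variable disables it.
    os.environ["HPU_ENABLE_TORCH_COMPILE"] = "1"
    print(bool(os.getenv("HPU_ENABLE_TORCH_COMPILE", False)))  # True

    del os.environ["HPU_ENABLE_TORCH_COMPILE"]
    print(bool(os.getenv("HPU_ENABLE_TORCH_COMPILE", False)))  # False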

src/instructlab/training/multipack_sampler.py

Lines changed: 6 additions & 6 deletions
@@ -70,9 +70,14 @@ def get_effective_samples_per_minibatch(num_tokens_per_gpu):
 
     The function creates a sampler using the MultipackDistributedBatchSampler class, generates batches using the sampler, and then returns the ratio of the dataset size to the number of batches.
     """
+    lengths = dataset.get_lengths()
+    if is_torch_hpu_available():
+        bucket_v = np.vectorize(bucket)
+        lengths = bucket_v(lengths)
+
     sampler = MultipackDistributedBatchSampler(
         batch_max_length=num_tokens_per_gpu,
-        lengths=dataset.get_lengths(),
+        lengths=lengths,
         num_replicas=torch.distributed.get_world_size(),
         rank=torch.distributed.get_rank(),
         seed=seed,

@@ -397,11 +402,6 @@ def generate_batches(self, set_stats=False):
         )
 
         lengths = self.lengths[indices]
-
-        if is_torch_hpu_available():
-            bucket_v = np.vectorize(bucket)
-            lengths = bucket_v(lengths)
-
         lengths_cumsum = np.cumsum(lengths)
 
         batches, total_used, total_slots = allocate(
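
The bucket helper is imported from instructlab.training.hpu_utils, whose implementation is not part of this diff. As a rough illustration of the kind of length bucketing used to limit the set of distinct tensor shapes the HPU graph compiler must handle, a hypothetical stand-in might round each length up to the next power of two:

    import numpy as np

    def bucket(length: int) -> int:
        # Hypothetical stand-in for hpu_utils.bucket: round a sequence
        # length up to the next power of two, so the compiler sees a
        # small fixed set of shapes instead of arbitrary lengths.
        return 1 << max(0, int(length) - 1).bit_length()

    bucket_v = np.vectorize(bucket)
    print(bucket_v(np.array([100, 300, 1000])))  # [ 128  512 1024]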

src/instructlab/training/token_dataset.py

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,7 @@
 from instructlab.training.multipack_sampler import MultipackDistributedBatchSampler
 from instructlab.training.utils import log_rank_0, make_collate_fn
 
+from instructlab.training.hpu_utils import is_torch_hpu_available, bucket
 
 class TokenDataset(Dataset):
     def __init__(self, data_path):

@@ -109,6 +110,10 @@ def setup_dataloader(
 
     lengths = dataset.get_lengths()
     if sampler == "multipack":
+        if is_torch_hpu_available():
+            bucket_v = np.vectorize(bucket)
+            lengths = bucket_v(lengths)
+
         sampler = MultipackDistributedBatchSampler(
             batch_max_length=packing_max_batch_len,
             lengths=lengths,
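
Taken together, the refactor moves the HPU bucketing out of MultipackDistributedBatchSampler.generate_batches and into the two call sites (get_effective_samples_per_minibatch and setup_dataloader), so lengths are bucketed once up front instead of on every batch-generation pass, and the sampler packs against the bucketed sizes. A small sketch of why packing against bucketed lengths matters, reusing the hypothetical bucket_v above:

    import numpy as np

    raw = np.array([100, 300, 1000])     # actual token counts per sample
    padded = np.array([128, 512, 1024])  # bucket_v(raw): sizes after padding

    # generate_batches packs against np.cumsum(lengths); budgeting with the
    # padded sizes presumably keeps each packed batch under batch_max_length
    # even after samples are padded up to their bucket boundaries.
    print(raw.sum(), padded.sum())  # 1400 vs. 1664 tokens budgeted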
