Exposes API for processing pretraining data #672
base: main
```diff
@@ -6,7 +6,6 @@
 import logging
 import os
 import subprocess
 import sys
 import time
 import warnings
```
```diff
@@ -47,6 +46,7 @@
 from instructlab.training.config import (
     DistributedBackend,
     ModelTypes,
+    PretrainingConfig,
     TorchrunArgs,
     TrainingArgs,
 )
```
```diff
@@ -364,6 +364,7 @@ def main(args):
     batch_size = args.effective_batch_size

     pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

     train_loader = get_data_loader(
         data_path=args.data_path,
         batch_size=batch_size,
```
```diff
@@ -374,6 +375,7 @@ def main(args):
         num_workers=8,  # I don't like this but am setting it for consistency
         flash_enabled=flash_enabled,
         pad_token_id=pad_token_id,
+        pretraining_config=getattr(args, "pretraining_config", None),
     )

     if args.local_rank == 0:
```
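The new `pretraining_config` argument tells the data loader it is packing raw documents into fixed-size token blocks rather than batching chat samples. The loader internals are outside this hunk, so the following is only a generic sketch of what block packing means; the helper name, the EOS handling, and dropping the trailing partial block are assumptions, not this repository's implementation.

```python
from typing import Iterable, Iterator


def pack_token_ids_into_blocks(
    tokenized_docs: Iterable[list[int]],
    block_size: int,
    eos_token_id: int,
) -> Iterator[list[int]]:
    """Concatenate tokenized documents (EOS-separated) and yield fixed-size blocks.

    Generic illustration of a pretraining block size; NOT the repository's
    get_data_loader implementation.
    """
    buffer: list[int] = []
    for doc_ids in tokenized_docs:
        buffer.extend(doc_ids)
        buffer.append(eos_token_id)  # mark the document boundary
        while len(buffer) >= block_size:
            yield buffer[:block_size]
            buffer = buffer[block_size:]
    # A trailing partial block is dropped here; real pipelines may pad it instead.


# Example: two short "documents" packed into blocks of 8 tokens.
docs = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11, 12]]
for block in pack_token_ids_into_blocks(docs, block_size=8, eos_token_id=0):
    print(block)  # -> [1, 2, 3, 4, 5, 0, 6, 7]
```

Packing concatenated documents into uniform blocks keeps every batch densely filled with tokens, which is the usual reason pretraining pipelines prefer it over per-sample padding.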
```diff
@@ -469,18 +471,27 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     )

     if train_args.process_data:
-        # TODO(osilkin):
-        # Decouple the data processing logic from training.
-        # Now that we've decided that repos will be less tethered to the
-        # design choices of the `ilab` CLI, we can make this change.
-        dp.process_data(
-            data_output_path=train_args.data_output_dir,
-            model_path=train_args.model_path,
-            data_path=train_args.data_path,
-            max_seq_len=train_args.max_seq_len,
-            chat_tmpl_path=train_args.chat_tmpl_path,
-            num_cpu_procs=train_args.data_process_num_cpu_procs,
-        )
+        if train_args.pretraining_config is not None:
+            dp.process_documents_for_pretraining(
+                data_path=train_args.data_path,
+                data_output_path=train_args.data_output_dir,
+                model_path=train_args.model_path,
+                num_cpu_procs=train_args.data_process_num_cpu_procs,
+                document_column_name=train_args.pretraining_config.document_column_name,
+            )
+        else:
+            # TODO(osilkin):
+            # Decouple the data processing logic from training.
+            # Now that we've decided that repos will be less tethered to the
+            # design choices of the `ilab` CLI, we can make this change.
+            dp.process_data(
+                data_output_path=train_args.data_output_dir,
+                model_path=train_args.model_path,
+                data_path=train_args.data_path,
+                max_seq_len=train_args.max_seq_len,
+                chat_tmpl_path=train_args.chat_tmpl_path,
+                num_cpu_procs=train_args.data_process_num_cpu_procs,
+            )

     if not os.path.exists(train_args.ckpt_output_dir):
         os.makedirs(train_args.ckpt_output_dir, exist_ok=True)
```
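With the branch above, a caller opts into the pretraining data path entirely through `TrainingArgs`. The sketch below shows one plausible invocation; `pretraining_config`, `process_data`, and the `PretrainingConfig` fields come from this diff, while the remaining field names and all values mirror the library's documented `run_training` example and are placeholders.

```python
from instructlab.training import run_training
from instructlab.training.config import (
    PretrainingConfig,
    TorchrunArgs,
    TrainingArgs,
)

# Placeholder paths and hyperparameters; adjust for your environment.
train_args = TrainingArgs(
    model_path="/models/granite-7b-base",
    data_path="/data/raw_documents.jsonl",
    data_output_dir="/data/processed",
    ckpt_output_dir="/checkpoints",
    max_seq_len=4096,
    max_batch_len=60000,
    num_epochs=1,
    effective_batch_size=128,
    learning_rate=2e-5,
    warmup_steps=25,
    save_samples=250000,
    process_data=True,  # triggers the data-processing branch above before training
    # New in this PR: a PretrainingConfig routes processing to
    # dp.process_documents_for_pretraining instead of dp.process_data.
    pretraining_config=PretrainingConfig(
        block_size=4096,
        document_column_name="document",  # dataset column holding raw text
    ),
)

torch_args = TorchrunArgs(
    nnodes=1,
    nproc_per_node=8,
    node_rank=0,
    rdzv_id=123,
    rdzv_endpoint="127.0.0.1:12345",
)

run_training(torch_args=torch_args, train_args=train_args)
```

When `pretraining_config` is left as `None`, the `else` branch keeps the existing `dp.process_data` behavior, so current callers are unaffected.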
```diff
@@ -537,6 +548,12 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         ]
     )

+    if train_args.pretraining_config is not None:
+        command.append(f"--block-size={train_args.pretraining_config.block_size}")
+        command.append(
+            f"--document-column-name={train_args.pretraining_config.document_column_name}"
+        )
+
     if train_args.chat_tmpl_path is not None:
         command.append(f"--chat-tmpl-path={train_args.chat_tmpl_path}")
```
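For a concrete sense of what gets forwarded, a hypothetical `PretrainingConfig(block_size=4096, document_column_name="document")` would add the two strings below to the torchrun command; the argument parser changes later in this diff turn them back into an `args.pretraining_config` inside the subprocess.

```python
from instructlab.training.config import PretrainingConfig

# Illustrative values only; the rest of the torchrun command is unchanged.
cfg = PretrainingConfig(block_size=4096, document_column_name="document")

extra_flags = [
    f"--block-size={cfg.block_size}",
    f"--document-column-name={cfg.document_column_name}",
]
assert extra_flags == ["--block-size=4096", "--document-column-name=document"]
```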
```diff
@@ -647,15 +664,12 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
             return

-        # wait for the process to exit so we can properly read the exit code
-        process.wait(timeout=60)
-        process_code = process.poll()
-        failure = process_code != 0
-
-        if not failure:
+        return_code = process.wait(timeout=60)  # wait for 1 min or error
+        if return_code == 0:
             logger.info("Operation completed successfully! 🎉")
         else:
             logger.error(
-                f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {process_code}"
+                f"Training subprocess has not exited yet. Sending SIGTERM. Process code: {return_code}"
             )

         process.terminate()
```
```diff
@@ -784,6 +798,18 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         help="Which modules we should target for injecting LoRA layers. Defaults to selecting all projection layers when no values are provided.",
     )
     parser.add_argument("--max_batch_len", type=int, default=60000)
+    parser.add_argument(
+        "--block-size",
+        type=int,
+        default=None,
+        help="When provided, enables pretraining mode with the given token block size.",
+    )
+    parser.add_argument(
+        "--document-column-name",
+        type=str,
+        default=None,
+        help="Column name containing raw documents for continual pretraining data.",
+    )
     parser.add_argument(
         "--cpu_offload_optimizer",
         action="store_true",
```
```diff
@@ -856,6 +882,18 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         help="Epsilon for numerical stability in AdamW optimizer.",
     )
     args = parser.parse_args()
+    if args.document_column_name is not None and args.block_size is None:
+        parser.error("--document-column-name requires --block-size to be specified.")
+
+    if args.block_size is not None:
+        pretraining_kwargs = {}
+        if args.document_column_name is not None:
+            pretraining_kwargs["document_column_name"] = args.document_column_name
+        args.pretraining_config = PretrainingConfig(
+            block_size=args.block_size, **pretraining_kwargs
+        )
+    else:
+        args.pretraining_config = None
     set_random_seed(args.seed)
     main(args)
```
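The definition of `PretrainingConfig` itself is not part of this file's diff. Judging only from how it is constructed and consumed here (a required `block_size` and an optional `document_column_name`), and assuming a Pydantic model like the other classes in `instructlab.training.config`, its shape is roughly the sketch below; the default column name is a placeholder guess.

```python
# Sketch only: the real class lives in instructlab.training.config.
from pydantic import BaseModel


class PretrainingConfig(BaseModel):
    # Number of tokens per packed training block; always forwarded via --block-size.
    block_size: int
    # Dataset column holding raw documents; optional at the CLI, so it
    # presumably carries a default (the value here is a placeholder guess).
    document_column_name: str = "document"
```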