scratchgpt/examples/simple.py at main · LabStrangeLoop/scratchgpt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python3
"""
Simple example showing minimal usage of ScratchGPT to train on Darwin's "On the Origin of Species"

This script demonstrates:
1. Downloading training data from Project Gutenberg
2. Setting up a basic configuration
3. Training a small transformer model
4. Basic text generation

Usage:
    python simple.py
"""

import sys
import tempfile
from pathlib import Path
from urllib.request import urlretrieve

import torch
from torch.optim import AdamW

# Import ScratchGPT components
from scratchgpt import (
    CharTokenizer,
    ScratchGPTArchitecture,
    ScratchGPTConfig,
    ScratchGPTTraining,
    Trainer,
    TransformerLanguageModel,
)
from scratchgpt.data import create_data_source


def download_darwin_text(
    data_file: Path,
    url: str = "https://www.gutenberg.org/files/1228/1228-0.txt",
) -> None:
    """Download Darwin's 'On the Origin of Species' using Python's built-in urllib.

    Args:
        data_file: Destination path the downloaded text is written to.
        url: Location to fetch the text from. Defaults to the Project Gutenberg
            plain-text file used by this example. Any URL that
            ``urllib.request.urlretrieve`` accepts works, including ``file://``
            URLs, which makes the function usable without network access.

    Raises:
        SystemExit: With exit status 1 when the download fails. A partially
            written ``data_file`` is removed before exiting.
    """
    print("Downloading 'On the Origin of Species' by Charles Darwin...")

    try:
        urlretrieve(url, data_file)
    except Exception as e:
        # Deliberately broad: this is the script's user-facing boundary, and
        # urllib can surface OSError subclasses, ValueError (malformed URL) and
        # http.client errors, all of which should end in the same friendly exit.
        try:
            # An interrupted transfer can leave a truncated file behind; remove
            # it so nothing downstream mistakes it for the complete corpus.
            data_file.unlink()
        except OSError:
            pass  # Nothing was written (or it cannot be removed); report the original error.
        print(f"Failed to download data: {e}")
        print("Please manually download the file from:")
        print(url)
        sys.exit(1)
    else:
        print(f"Downloaded data to: {data_file}")


def create_simple_config() -> ScratchGPTConfig:
    """Build the small model/training configuration used by this example.

    Returns:
        A ``ScratchGPTConfig`` combining a compact architecture with training
        settings chosen for quick runs. ``vocab_size`` is not set here; it is
        expected to be filled in from the tokenizer afterwards.
    """
    return ScratchGPTConfig(
        # Deliberately small so it trains quickly on a CPU or a small GPU.
        architecture=ScratchGPTArchitecture(
            block_size=128,
            embedding_size=256,
            num_heads=8,
            num_blocks=4,
        ),
        # Training settings aimed at getting results quickly.
        training=ScratchGPTTraining(
            max_epochs=20,
            learning_rate=3e-4,
            batch_size=32,
            dropout_rate=0.1,
            random_seed=1337,
            iteration_type="chunking",
        ),
    )


def prepare_text_for_tokenizer(data_file: Path) -> str:
    """Load the entire corpus file into memory for tokenization.

    Args:
        data_file: Path to the text file, decoded as UTF-8.

    Returns:
        The full contents of ``data_file`` as a single string.
    """
    print(f"Reading text from: {data_file}")

    contents = data_file.read_text(encoding="utf-8")
    char_count = len(contents)

    print(f"Text length: {char_count:,} characters")
    return contents


def main() -> None:
    """Run the example end to end: fetch the corpus, train a model, sample text.

    All files (the downloaded text and the experiment directory) are placed
    under a temporary directory that is removed when the ``with`` block exits.
    Training can be cut short with Ctrl-C, in which case text generation still
    runs using the model in its current state.
    """
    banner_rule = "=" * 50
    print("ScratchGPT Simple Training Example")
    print(banner_rule)

    # Everything lives under a throwaway directory that is deleted on exit.
    with tempfile.TemporaryDirectory() as scratch_root:
        workdir = Path(scratch_root)
        corpus_path = workdir / "darwin_origin_species.txt"
        run_dir = workdir / "darwin_experiment"

        # --- 1. Fetch the corpus ---
        download_darwin_text(corpus_path)

        # --- 2. Load the text and build a tokenizer from it ---
        corpus = prepare_text_for_tokenizer(corpus_path)
        print("Creating character-level tokenizer...")
        tokenizer = CharTokenizer(text=corpus)
        print(f"Vocabulary size: {tokenizer.vocab_size}")

        # Alternative: use a pre-trained tokenizer such as GPT-2's.
        # This requires: uv sync --extra hf-tokenizers
        #
        # from scratchgpt import HuggingFaceTokenizer
        # tokenizer = HuggingFaceTokenizer.from_hub("gpt2")
        # print(f"Vocabulary size: {tokenizer.vocab_size}")  # ~50,257 tokens
        #
        # Trade-offs:
        # - CharTokenizer: small vocab (~100 chars), learns from scratch, simple
        # - GPT-2 tokenizer: large vocab (~50K tokens), pre-trained, better text quality
        # - The GPT-2 tokenizer will likely generate more coherent text but requires more memory

        # --- 3. Build the configuration; the vocab size comes from the tokenizer ---
        config = create_simple_config()
        config.architecture.vocab_size = tokenizer.vocab_size
        config_summary = (
            f"Model configuration: {config.architecture.embedding_size}D embeddings, "
            f"{config.architecture.num_blocks} blocks, {config.architecture.num_heads} heads"
        )
        print(config_summary)

        # --- 4. Pick a device, then create the model, optimizer and data source ---
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        print(f"Using device: {device}")

        model = TransformerLanguageModel(config).to(device)
        param_count = sum(p.numel() for p in model.parameters())
        print(f"Model parameters: {param_count:,}")

        optimizer = AdamW(model.parameters(), lr=config.training.learning_rate)
        data_source = create_data_source(str(corpus_path))

        # --- 5. Train ---
        trainer = Trainer(
            model=model,
            config=config.training,
            optimizer=optimizer,
            experiment_path=run_dir,
            device=device,
        )

        print("\nStarting training...")
        print("(Press Ctrl-C to stop training early and see text generation)")

        try:
            trainer.train(data_source=data_source, tokenizer=tokenizer)
            print("\nTraining completed successfully!")
        except KeyboardInterrupt:
            # An early stop is not an error: continue to the generation demo.
            print("\n\nTraining interrupted by user. Moving to text generation with current model state...")

        # --- 6. Generate a short continuation for a few fixed prompts ---
        print("\nTesting text generation:")
        model.eval()

        for prompt in ("Natural selection", "The origin of species", "Darwin observed"):
            print(f"\nPrompt: '{prompt}'")
            prompt_ids = tokenizer.encode(prompt)
            # unsqueeze(0) adds a leading dimension of size 1 before generation.
            context = torch.tensor(prompt_ids).unsqueeze(0).to(device)

            # Gradients are not needed for sampling.
            with torch.no_grad():
                generated = model.generate(context, max_new_tokens=100)

            sample_text = tokenizer.decode(generated[0].tolist())
            print(f"Generated: {sample_text}")

        print("\nAll temporary files automatically cleaned up.")
        print("Run the script again to start fresh.")


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()