Skip to content

Commit 2d7c62c

Browse files
committed
Update_[18_Aug_2025]_[SHUtils]
1 parent fe38acd commit 2d7c62c

6 files changed

Lines changed: 446 additions & 27 deletions

File tree

README.md

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,53 @@
11
# Python Language Feature Set
22

3-
## Basic features
3+
## Basic Features
44

5-
- [Hello world](com/inbravo/core/HelloWorld.py)
6-
- [Hello world using Jupitor Notebook](com/inbravo/core/HelloWorld.ipynb)
7-
- [Data types](com/inbravo/core/DataTypeTest.py)
8-
- [Variable types](com/inbravo/core/VariableTest.py)
9-
- [Why intendation matters in Python](com/inbravo/core/IntendationTest.py)
10-
- [Main function](com/inbravo/core/MainFunctionTest.py)
5+
- [Hello World](com/inbravo/core/HelloWorld.py)
6+
- [Hello World using Jupyter Notebook](com/inbravo/core/HelloWorld.ipynb)
7+
- [Data Types](com/inbravo/core/DataTypeTest.py)
8+
- [Variable Types](com/inbravo/core/VariableTest.py)
9+
- [Why Indentation Matters in Python](com/inbravo/core/IntendationTest.py)
10+
- [Main Function](com/inbravo/core/MainFunctionTest.py)
1111

12-
## Data structures
12+
## File Operations
13+
14+
- [shutil Examples (the "shell utilities" module)](com/inbravo/file/SHUtil_Test.py)
15+
- [Get File Metadata](com/inbravo/file/File_Meta_Data.py)
16+
- [Get Count of Files in a Folder](com/inbravo/file/File_Count.py)
17+
18+
## Data Structures
1319

1420
- [Tuples](com/inbravo/core/TupleTest.py)
1521
- [Sets](com/inbravo/core/SetTest.py)
1622

17-
## String handling
23+
## String Handling
1824

1925
- [Formatted Strings](com/inbravo/string/FString.py)
2026

21-
## Regular expressions
27+
## Regular Expressions
2228

23-
- [Regular expressions based string splitting](com/inbravo/regexp/Reg_Exp_Utils.py)
29+
- [Regular Expressions Based String Splitting](com/inbravo/regexp/Reg_Exp_Utils.py)
2430

2531
## System
2632

27-
- [Operating system information](com/inbravo/system/OSInfo.py)
28-
- [Find the library version in environment](com/inbravo/system/LibVersion.py)
33+
- [Operating System Information](com/inbravo/system/OSInfo.py)
34+
- [Find the Library Version in Environment](com/inbravo/system/LibVersion.py)
2935

30-
## Matlib
36+
## Matplotlib
3137

32-
- [Create a two dimentional graph](com/inbravo/matplot/Graph_Test.py)
38+
- [Create a Two-Dimensional Graph](com/inbravo/matplot/Graph_Test.py)
3339

3440
## PySpark
3541

36-
- [Calculate gross income of a super marker](com/inbravo/dbx/super-market/Gross_Income.ipynb)
42+
- [Calculate Gross Income of a Supermarket](com/inbravo/dbx/super-market/Gross_Income.ipynb)
3743

38-
## VENV (Optional)
44+
## Virtual Environment (VENV) (Optional)
3945

4046
1. Install Python: `brew install python@3.11`
4147
2. Install PIP: `pip3.11 install uv`
42-
3. Create VENV in the downloaded codebase: `python3.11 -m venv .venv`
43-
4. uv pip install -r requirements.txt
44-
5. Activate VENV: `source .venv/bin/activate`
48+
3. Create a virtual environment in the downloaded codebase: `python3.11 -m venv .venv`
49+
4. Install dependencies: `pip install -r requirements.txt`
50+
5. Activate the virtual environment: `source .venv/bin/activate`
4551

4652
## License
4753

com/inbravo/.DS_Store

0 Bytes
Binary file not shown.

com/inbravo/file/SHUtil_Test.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""Demonstrate common file and directory operations from the shutil module.

The script builds a small directory tree, exercises copy/move/rename,
archives and unpacks it, and then removes everything it created so that
repeated runs always start from a clean slate.
"""
import os
import shutil

# Names of everything this demo creates in the current working directory.
_DEMO_DIR = 'example_dir'
_COPY_DIR = 'example_dir_copy'
_EXTRACT_DIR = 'extracted_dir'
_ARCHIVE_BASE = 'example_dir_archive'
_ARCHIVE_FILE = _ARCHIVE_BASE + '.zip'

# Remove leftovers from a previous crashed run, otherwise os.mkdir and
# shutil.copytree below would fail with FileExistsError.
for _leftover in (_DEMO_DIR, _COPY_DIR, _EXTRACT_DIR):
    shutil.rmtree(_leftover, ignore_errors=True)
if os.path.exists(_ARCHIVE_FILE):
    os.remove(_ARCHIVE_FILE)

try:
    # Create a directory
    os.mkdir(_DEMO_DIR)

    # Create a file in the directory
    with open(os.path.join(_DEMO_DIR, 'example_file.txt'), 'w', encoding='utf-8') as f:
        f.write('This is an example file.')

    # Copy a file
    shutil.copy(os.path.join(_DEMO_DIR, 'example_file.txt'),
                os.path.join(_DEMO_DIR, 'copied_file.txt'))

    # Copy a whole directory tree
    shutil.copytree(_DEMO_DIR, _COPY_DIR)

    # Move a file
    shutil.move(os.path.join(_DEMO_DIR, 'copied_file.txt'),
                os.path.join(_DEMO_DIR, 'moved_file.txt'))

    # Rename a file
    os.rename(os.path.join(_DEMO_DIR, 'moved_file.txt'),
              os.path.join(_DEMO_DIR, 'renamed_file.txt'))

    # Archive a directory (creates example_dir_archive.zip)
    shutil.make_archive(_ARCHIVE_BASE, 'zip', _DEMO_DIR)

    # Extract the archive
    shutil.unpack_archive(_ARCHIVE_FILE, _EXTRACT_DIR)
finally:
    # Always clean up, even if one of the demo steps above raised, so the
    # next run does not trip over stale directories or a stale archive.
    for _leftover in (_DEMO_DIR, _COPY_DIR, _EXTRACT_DIR):
        shutil.rmtree(_leftover, ignore_errors=True)
    if os.path.exists(_ARCHIVE_FILE):
        os.remove(_ARCHIVE_FILE)

print("All shutil operations completed successfully.")

com/inbravo/llm/dataloader.ipynb

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "6e2a4891-c257-4d6b-afb3-e8fef39d0437",
6+
"metadata": {},
7+
"source": [
8+
"<table style=\"width:100%\">\n",
9+
"<tr>\n",
10+
"<td style=\"vertical-align:middle; text-align:left;\">\n",
11+
"<font size=\"2\">\n",
12+
"Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
13+
"<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
14+
"</font>\n",
15+
"</td>\n",
16+
"<td style=\"vertical-align:middle; text-align:left;\">\n",
17+
"<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
18+
"</td>\n",
19+
"</tr>\n",
20+
"</table>\n"
21+
]
22+
},
23+
{
24+
"cell_type": "markdown",
25+
"id": "6f678e62-7bcb-4405-86ae-dce94f494303",
26+
"metadata": {},
27+
"source": [
28+
"# The Main Data Loading Pipeline Summarized"
29+
]
30+
},
31+
{
32+
"cell_type": "markdown",
33+
"id": "070000fc-a7b7-4c56-a2c0-a938d413a790",
34+
"metadata": {},
35+
"source": [
36+
"The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).\n",
37+
"\n",
38+
"This notebook contains the main takeaway, the data loading pipeline without the intermediate steps."
39+
]
40+
},
41+
{
42+
"cell_type": "markdown",
43+
"id": "2b4e8f2d-cb81-41a3-8780-a70b382e18ae",
44+
"metadata": {},
45+
"source": [
46+
"Packages that are being used in this notebook:"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": 3,
52+
"id": "c7ed6fbe-45ac-40ce-8ea5-4edb212565e1",
53+
"metadata": {},
54+
"outputs": [
55+
{
56+
"name": "stdout",
57+
"output_type": "stream",
58+
"text": [
59+
"torch version: 2.8.0\n",
60+
"tiktoken version: 0.11.0\n"
61+
]
62+
}
63+
],
64+
"source": [
65+
"# NBVAL_SKIP\n",
66+
"from importlib.metadata import version\n",
67+
"\n",
68+
"print(\"torch version:\", version(\"torch\"))\n",
69+
"print(\"tiktoken version:\", version(\"tiktoken\"))"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e",
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"import tiktoken\n",
80+
"import torch\n",
81+
"import os\n",
82+
"import urllib.request\n",
83+
"from torch.utils.data import Dataset, DataLoader\n",
84+
"\n",
85+
"\n",
86+
"class GPTDatasetV1(Dataset):\n",
87+
" def __init__(self, txt, tokenizer, max_length, stride):\n",
88+
" self.input_ids = []\n",
89+
" self.target_ids = []\n",
90+
"\n",
91+
" # Tokenize the entire text\n",
92+
" token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
93+
"\n",
94+
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
95+
" for i in range(0, len(token_ids) - max_length, stride):\n",
96+
" input_chunk = token_ids[i:i + max_length]\n",
97+
" target_chunk = token_ids[i + 1: i + max_length + 1]\n",
98+
" self.input_ids.append(torch.tensor(input_chunk))\n",
99+
" self.target_ids.append(torch.tensor(target_chunk))\n",
100+
"\n",
101+
" def __len__(self):\n",
102+
" return len(self.input_ids)\n",
103+
"\n",
104+
" def __getitem__(self, idx):\n",
105+
" return self.input_ids[idx], self.target_ids[idx]\n",
106+
"\n",
107+
"\n",
108+
"def create_dataloader_v1(txt, batch_size, max_length, stride,\n",
109+
" shuffle=True, drop_last=True, num_workers=0):\n",
110+
" # Initialize the tokenizer\n",
111+
" tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
112+
"\n",
113+
" # Create dataset\n",
114+
" dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
115+
"\n",
116+
" # Create dataloader\n",
117+
" dataloader = DataLoader(\n",
118+
" dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n",
119+
"\n",
120+
" return dataloader\n",
121+
"\n",
122+
"# Download the text file if it does not exist \n",
123+
"if not os.path.exists(\"the-verdict.txt\"):\n",
124+
" print(\"Downloading the-verdict.txt...\")\n",
125+
" url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
126+
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
127+
" \"the-verdict.txt\")\n",
128+
" file_path = \"the-verdict.txt\"\n",
129+
" urllib.request.urlretrieve(url, file_path)\n",
130+
"\n",
131+
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
132+
" raw_text = f.read()\n",
133+
"\n",
134+
"vocab_size = 50257\n",
135+
"output_dim = 256\n",
136+
"context_length = 1024\n",
137+
"\n",
138+
"\n",
139+
"token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
140+
"pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n",
141+
"\n",
142+
"batch_size = 8\n",
143+
"max_length = 4\n",
144+
"dataloader = create_dataloader_v1(\n",
145+
" raw_text,\n",
146+
" batch_size=batch_size,\n",
147+
" max_length=max_length,\n",
148+
" stride=max_length\n",
149+
")"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": null,
155+
"id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846",
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"for batch in dataloader:\n",
160+
" x, y = batch\n",
161+
"\n",
162+
" token_embeddings = token_embedding_layer(x)\n",
163+
" pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
164+
"\n",
165+
" input_embeddings = token_embeddings + pos_embeddings\n",
166+
"\n",
167+
" break"
168+
]
169+
},
170+
{
171+
"cell_type": "code",
172+
"execution_count": null,
173+
"id": "d3664332-e6bb-447e-8b96-203aafde8b24",
174+
"metadata": {},
175+
"outputs": [],
176+
"source": [
177+
"print(input_embeddings.shape)"
178+
]
179+
}
180+
],
181+
"metadata": {
182+
"kernelspec": {
183+
"display_name": "Python 3 (ipykernel)",
184+
"language": "python",
185+
"name": "python3"
186+
},
187+
"language_info": {
188+
"codemirror_mode": {
189+
"name": "ipython",
190+
"version": 3
191+
},
192+
"file_extension": ".py",
193+
"mimetype": "text/x-python",
194+
"name": "python",
195+
"nbconvert_exporter": "python",
196+
"pygments_lexer": "ipython3",
197+
"version": "3.11.13"
198+
}
199+
},
200+
"nbformat": 4,
201+
"nbformat_minor": 5
202+
}

requirements.txt

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
matplotlib >= 3.10 # com.inbravo.matplot
22
pandas >= 2.2.1 # com.inbravo.pandas
3-
torch >= 2.3.0 # all
4-
jupyterlab >= 4.0 # all
5-
tiktoken >= 0.5.1 # ch02; ch04; ch05
6-
matplotlib >= 3.7.1 # ch04; ch06; ch07
7-
tensorflow >= 2.18.0 # ch05; ch06; ch07
8-
tqdm >= 4.66.1 # ch05; ch07
3+
torch >= 2.3.0 # com.inbravo.llm
4+
jupyterlab >= 4.0 # com.inbravo.llm
5+
tiktoken >= 0.5.1 # com.inbravo.llm
6+
tensorflow >= 2.18.0 # com.inbravo.llm
7+
tqdm >= 4.66.1 # com.inbravo.llm
98
numpy >= 1.26, < 2.1 # dependency of several other libraries like torch and pandas
10-
psutil >= 5.9.5 # ch07; already installed automatically as dependency of torch
9+
psutil >= 5.9.5 # com.inbravo.llm; as dependency of torch

0 commit comments

Comments
 (0)