-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchunking.py
More file actions
38 lines (31 loc) · 1008 Bytes
/
chunking.py
File metadata and controls
38 lines (31 loc) · 1008 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
def chunk_text(md_path: str, *, chunk_size: int = 600, chunk_overlap: int = 25) -> list:
    """
    Split a paginated Markdown file into text chunks, page by page.

    Page boundaries are heading lines of the form ``## Page <number>``. The
    text of each page is split on its own, so a chunk never contains text
    from two different pages. The heading lines themselves are not part of
    any chunk.

    Args:
        md_path: Path to the Markdown file; it is read as UTF-8.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size`` (lengths are measured with ``len``).
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A list of dicts in document order. Each dict has a ``"text"`` key
        (the chunk) and a ``"metadata"`` dict with ``"source"`` (``md_path``
        as given), ``"page_number"`` (the number from the page heading) and
        ``"chunk_index"`` (0-based, counted across the whole file).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(md_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    # The capture group makes re.split() keep every heading's number, giving
    # [preamble, number, text, number, text, ...]. Anchoring with ^/$ under
    # MULTILINE also matches a heading on the very first or very last line,
    # which a pattern wrapped in literal "\n" characters would miss.
    parts = re.split(r"^## Page (\d+)[ \t]*$", markdown_text, flags=re.MULTILINE)
    preamble = parts[0]
    numbered_pages = [
        (int(number), text) for number, text in zip(parts[1::2], parts[2::2])
    ]

    # Text before the first heading has no number of its own: attribute it to
    # the first numbered page, or to page 1 when the file has no headings.
    first_page_number = numbered_pages[0][0] if numbered_pages else 1
    pages = [(first_page_number, preamble), *numbered_pages]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        is_separator_regex=False,
        length_function=len,
    )

    results = []
    for page_number, raw_text in pages:
        page_text = raw_text.strip()
        if not page_text:
            # Skipping an empty page must not renumber the pages after it,
            # which is why the number comes from the heading, not a counter.
            continue
        for chunk in splitter.split_text(page_text):
            results.append({
                "text": chunk,
                "metadata": {
                    "source": md_path,
                    "page_number": page_number,
                    # Evaluated before the append, so this is the running
                    # 0-based index across all pages.
                    "chunk_index": len(results),
                },
            })
    return results