Skip to content

Commit a06c319

Browse files
committed
Fixes #139
1 parent 84a3b13 commit a06c319

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

src/server/utils/embedding.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
33
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
44
"""
5-
# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs
5+
# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs, genai, hnsw
66

77
import json
88
import copy
@@ -22,7 +22,7 @@
2222
from langchain_core.language_models.chat_models import BaseChatModel
2323
from langchain.docstore.document import Document as LangchainDocument
2424
from langchain.text_splitter import RecursiveCharacterTextSplitter
25-
from langchain_text_splitters import HTMLSectionSplitter, CharacterTextSplitter
25+
from langchain_text_splitters import HTMLHeaderTextSplitter, CharacterTextSplitter
2626

2727
import server.utils.databases as databases
2828

@@ -130,20 +130,20 @@ def split_document(
130130
("h4", "Header 4"),
131131
("h5", "Header 5"),
132132
]
133-
html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)
133+
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
134134
##################################
135135
# Splitters - End
136136
##################################
137137
match extension.lower():
138138
case "pdf":
139139
doc_split = text_splitter.split_documents(document)
140140
case "html":
141-
try:
142-
html_split = html_splitter.split_documents(document)
143-
except Exception as ex:
144-
logger.exception(ex)
145-
html_split = document
146-
doc_split = text_splitter.split_documents(html_split)
141+
tmp_meta = document[0].metadata
142+
print(tmp_meta)
143+
doc_split = html_splitter.split_text(document[0].page_content)
144+
# Update metadata with source
145+
for doc in doc_split:
146+
doc.metadata.update(tmp_meta)
147147
case "pdf" | "md" | "txt" | "csv":
148148
doc_split = text_splitter.split_documents(document)
149149
case _:
@@ -180,7 +180,8 @@ def load_and_split_documents(
180180
case "pdf":
181181
loader = document_loaders.PyPDFLoader(file)
182182
case "html":
183-
loader = document_loaders.UnstructuredHTMLLoader(file)
183+
# Use TextLoader to preserve the raw HTML markup so it can be split on headers
184+
loader = document_loaders.TextLoader(file)
184185
case "md":
185186
loader = document_loaders.TextLoader(file)
186187
case "csv":

0 commit comments

Comments
 (0)