-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunction.py
More file actions
151 lines (120 loc) · 5.69 KB
/
function.py
File metadata and controls
151 lines (120 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import tempfile
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
def init_state():
    """
    Ensure all Streamlit session-state keys this app relies on exist.

    Creates each key only when absent, so chat history, processed-file
    tracking, the vector store, and the chain memory persist across
    Streamlit reruns instead of being wiped on every interaction.
    """
    # key -> initial value; each literal is a fresh object per call
    defaults = {
        "message": [],            # visual chat message history
        "processed_files": [],    # files already embedded (skip re-embedding)
        "vectorstore": None,      # FAISS vector database
        "chat_history": [],       # conversation memory for the chain
    }
    for key, initial in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = initial
def change_on_api_key():
    """
    Callback fired when the API Key input widget changes.

    Performs a hard session reset for security: wipes the visible chat,
    the processed-file tracking, and any agent structures built with the
    previous key, then notifies the user.
    """
    # Wipe the user-visible state
    st.session_state.message = []
    st.session_state.processed_files = []
    # Drop backend structures tied to the old key
    for stale_key in ("vectorstore", "qa_chain"):
        st.session_state.pop(stale_key, None)
    # Notify the user
    st.toast("API Key updated. Session reset.", icon="🔄")
def reset_state():
    """
    Manually reset the conversation via the Reset button.

    Clears the chat context, file tracking, and backend data structures
    while keeping the current configuration (such as the API Key) intact.
    """
    # Wipe the user-visible state
    st.session_state.message = []
    st.session_state.processed_files = []
    # Drop backend structures (vector store & QA chain)
    for stale_key in ("vectorstore", "qa_chain"):
        st.session_state.pop(stale_key, None)
    # Notify the user
    st.toast("Conversation & memory cleared.", icon="🧹")
def reset_agent():
    """
    Discard the current QA chain from the session state.

    Called when model parameters (e.g. temperature or language) change,
    so the agent is rebuilt with the new settings on the next run.
    """
    if "qa_chain" in st.session_state:
        del st.session_state["qa_chain"]
def process_file(uploaded_file):
    """
    Extract text content from an uploaded PDF, CSV, TXT, or MD file.

    Writes the upload to a temporary file on disk (LangChain loaders
    require a file path), dispatches to the appropriate loader based on
    the file extension, and surfaces user-friendly messages for common
    failure modes (bad encoding, unreadable PDFs).

    Args:
        uploaded_file: The file object uploaded via the Streamlit widget.

    Returns:
        List[Document] | None: A list of LangChain Document objects if
        successful, or None if the format is unsupported or an error occurs.
    """
    filename = uploaded_file.name
    file_ext = os.path.splitext(filename)[-1]
    # Normalize once; the original re-lowercased and additionally
    # re-checked filename.endswith(...), which was redundant because
    # file_ext is already derived from splitext(filename).
    ext = file_ext.lower()

    # delete=False keeps the file accessible by path after the `with`
    # block closes the handle (required on Windows, where an open
    # NamedTemporaryFile cannot be reopened by the loader).
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name

    try:
        # Select the appropriate loader based on file extension
        if ext == ".pdf":
            loader = PyPDFLoader(tmp_file_path)
        elif ext == ".csv":
            # CSVs require specific encoding handling
            loader = CSVLoader(tmp_file_path, encoding="utf-8")
        elif ext in (".txt", ".md"):
            loader = TextLoader(tmp_file_path, encoding="utf-8")
        else:
            # Handle unsupported file types gracefully
            st.warning(f"⚠️ Unsupported File Format: '{file_ext}'. Please upload PDF, CSV, MD, or TXT files only.")
            return None
        # Execute the loading process
        return loader.load()
    # --- ADVANCED ERROR HANDLING ---
    # Broad catch is deliberate: loader failures are reported to the
    # user in the UI rather than crashing the Streamlit app.
    except Exception as e:
        error_message = str(e)
        # 1. Handle Encoding Errors (Common in CSV/TXT exported from Excel)
        if "codec" in error_message or "decode" in error_message:
            st.error("🔤 **Read Error: Invalid Text Encoding**")
            st.warning(
                """
                **Explanation:** The uploaded file uses a text encoding format that is not supported (likely ANSI or ISO-8859-1).
                **Solution:** Open your file in a text editor (like Notepad) and save it specifically with **UTF-8** encoding, then try uploading again.
                """
            )
        # 2. Handle Corrupt or Password-Protected PDFs
        elif "PDF" in error_message or "EOF" in error_message:
            st.error("📄 **Read Error: Cannot Process PDF**")
            st.info("The PDF file appears to be corrupted, encrypted, or password-protected. Please ensure the file is accessible and try again.")
        # 3. Handle Generic/Unknown Errors
        else:
            st.error("⚠️ **File Processing Failed**")
            st.write(f"**Technical Details:** `{error_message}`")
        return None
    finally:
        # Cleanup: remove the temporary file so uploads don't accumulate on disk
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)