-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunction.py
More file actions
151 lines (120 loc) · 5.69 KB
/
function.py
File metadata and controls
151 lines (120 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import tempfile
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
def init_state():
    """
    Ensure all Streamlit session-state keys this app relies on exist.

    Creates each key only when absent, so chat history, processed-file
    tracking, the vector store, and the chain memory persist across
    Streamlit reruns instead of being wiped on every interaction.
    """
    # key -> initial value; each literal is a fresh object per call
    defaults = {
        "message": [],            # visual chat message history
        "processed_files": [],    # files already embedded (skip re-embedding)
        "vectorstore": None,      # FAISS vector database
        "chat_history": [],       # conversation memory for the chain
    }
    for key, initial in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = initial
def change_on_api_key():
    """
    Callback fired when the API Key input widget changes.

    Performs a hard session reset for security: wipes the visible chat,
    the processed-file tracking, and any agent structures built with the
    previous key, then notifies the user.
    """
    # Wipe the user-visible state
    st.session_state.message = []
    st.session_state.processed_files = []
    # Drop backend structures tied to the old key
    for stale_key in ("vectorstore", "qa_chain"):
        st.session_state.pop(stale_key, None)
    # Notify the user
    st.toast("API Key updated. Session reset.", icon="🔄")
def reset_state():
    """
    Manually reset the conversation via the Reset button.

    Clears the chat context, file tracking, and backend data structures
    while keeping the current configuration (such as the API Key) intact.
    """
    # Wipe the user-visible state
    st.session_state.message = []
    st.session_state.processed_files = []
    # Drop backend structures (vector store & QA chain)
    for stale_key in ("vectorstore", "qa_chain"):
        st.session_state.pop(stale_key, None)
    # Notify the user
    st.toast("Conversation & memory cleared.", icon="🧹")
def reset_agent():
    """
    Discard the current QA chain from the session state.

    Called when model parameters (e.g. temperature or language) change,
    so the agent is rebuilt with the new settings on the next run.
    """
    if "qa_chain" in st.session_state:
        del st.session_state["qa_chain"]
def process_file(uploaded_file):
    """
    Extract text content from an uploaded PDF, CSV, TXT, or MD file.

    Writes the upload to a temporary file on disk (LangChain loaders
    require a file path), dispatches to the appropriate loader based on
    the file extension, and surfaces user-friendly messages for common
    failure modes (bad encoding, unreadable PDFs).

    Args:
        uploaded_file: The file object uploaded via the Streamlit widget.

    Returns:
        List[Document] | None: A list of LangChain Document objects if
        successful, or None if the format is unsupported or an error occurs.
    """
    filename = uploaded_file.name
    file_ext = os.path.splitext(filename)[-1]
    # Normalize once; the original re-lowercased and additionally
    # re-checked filename.endswith(...), which was redundant because
    # file_ext is already derived from splitext(filename).
    ext = file_ext.lower()

    # delete=False keeps the file accessible by path after the `with`
    # block closes the handle (required on Windows, where an open
    # NamedTemporaryFile cannot be reopened by the loader).
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name

    try:
        # Select the appropriate loader based on file extension
        if ext == ".pdf":
            loader = PyPDFLoader(tmp_file_path)
        elif ext == ".csv":
            # CSVs require specific encoding handling
            loader = CSVLoader(tmp_file_path, encoding="utf-8")
        elif ext in (".txt", ".md"):
            loader = TextLoader(tmp_file_path, encoding="utf-8")
        else:
            # Handle unsupported file types gracefully
            st.warning(f"⚠️ Unsupported File Format: '{file_ext}'. Please upload PDF, CSV, MD, or TXT files only.")
            return None
        # Execute the loading process
        return loader.load()
    # --- ADVANCED ERROR HANDLING ---
    # Broad catch is deliberate: loader failures are reported to the
    # user in the UI rather than crashing the Streamlit app.
    except Exception as e:
        error_message = str(e)
        # 1. Handle Encoding Errors (Common in CSV/TXT exported from Excel)
        if "codec" in error_message or "decode" in error_message:
            st.error("🔤 **Read Error: Invalid Text Encoding**")
            st.warning(
                """
                **Explanation:** The uploaded file uses a text encoding format that is not supported (likely ANSI or ISO-8859-1).
                **Solution:** Open your file in a text editor (like Notepad) and save it specifically with **UTF-8** encoding, then try uploading again.
                """
            )
        # 2. Handle Corrupt or Password-Protected PDFs
        elif "PDF" in error_message or "EOF" in error_message:
            st.error("📄 **Read Error: Cannot Process PDF**")
            st.info("The PDF file appears to be corrupted, encrypted, or password-protected. Please ensure the file is accessible and try again.")
        # 3. Handle Generic/Unknown Errors
        else:
            st.error("⚠️ **File Processing Failed**")
            st.write(f"**Technical Details:** `{error_message}`")
        return None
    finally:
        # Cleanup: remove the temporary file so uploads don't accumulate on disk
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)