Commit 60600c6

Merge pull request #318 from oracle/33-crawl-html
33 crawl html - Basic Web Crawling functionality integrated
2 parents f767929 + c6a4c3e commit 60600c6

File tree

4 files changed, +177 -22 lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -82,4 +82,4 @@ spring_ai/target/**
 spring_ai/create_user.sql
 spring_ai/drop.sql
 src/client/spring_ai/target/classes/*
-api_server_key
+api_server_key

src/server/api/utils/embed.py

Lines changed: 2 additions & 0 deletions
@@ -176,6 +176,8 @@ def load_and_split_documents(
         case "png" | "jpg" | "jpeg":
             loader = UnstructuredImageLoader(file)
             split = False
+        case "txt":
+            loader = document_loaders.TextLoader(file)
         case _:
             raise ValueError(f"{extension} is not a supported file extension")

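For context on the new "txt" branch, the sketch below shows what a loader of that kind does with one of the per-section text files this PR produces. It is not part of the commit: it assumes document_loaders refers to langchain_community (as the neighbouring UnstructuredImageLoader case suggests), and the splitter is only an illustrative choice, since the splitter actually used by load_and_split_documents is configured outside this diff.

# Minimal sketch (not part of the commit): load a plain-text section file the way
# the new "txt" case does, then split it into chunks for embedding.
# The splitter below is an assumption for illustration only.
from langchain_community import document_loaders
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = document_loaders.TextLoader("getting-started.txt", encoding="utf-8")
docs = loader.load()                                    # one Document per file
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)                 # ready for embedding
print(f"{len(chunks)} chunk(s) from {len(docs)} document(s)")
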
src/server/api/utils/webscrape.py

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
from bs4 import BeautifulSoup, Comment
import re, unicodedata
from typing import List, Dict, Tuple
import aiohttp

BAD_CHUNKS = [
    "nav","header","footer","aside","form","menu","breadcrumb","toc","pagination",
    "subscribe","advert","ads","promo","social","share","comment","related","widget",
    "modal","banner","cookie","newsletter","disclaimer"
]

def normalize_ws(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def clean_soup(soup: BeautifulSoup) -> None:
    for el in soup(["script","style","noscript","template","svg","canvas","iframe"]):
        el.decompose()
    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        c.extract()
    for tag in soup.find_all(True):
        ident = " ".join([
            (tag.get("id") or ""),
            " ".join(tag.get("class") or []),
            (tag.get("role") or "")
        ]).lower()
        if any(b in ident for b in BAD_CHUNKS):
            tag.decompose()

def heading_level(tag) -> int:
    return int(tag.name[1])

def group_by_sections(soup):
    sections = []
    for section in soup.find_all(['section', 'article']):
        # Use the first heading if present for section title
        heading = section.find(re.compile('^h[1-6]$'))
        title = normalize_ws(heading.get_text()) if heading else ""
        paragraphs = []
        for p in section.find_all('p'):
            txt = normalize_ws(p.get_text())
            if txt:
                paragraphs.append(txt)
        if paragraphs:
            # All paragraphs in the section are joined with blank lines; change as you prefer
            sections.append({"title": title, "content": "\n\n".join(paragraphs)})
    return sections

def table_to_markdown(table):
    # Simple HTML table to Markdown converter
    rows = []
    for tr in table.find_all("tr"):
        cols = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
        rows.append(cols)
    # Make Markdown
    md = ""
    if rows:
        md += "| " + " | ".join(rows[0]) + " |\n"
        md += "| " + " | ".join("---" for _ in rows[0]) + " |\n"
        for row in rows[1:]:
            md += "| " + " | ".join(row) + " |\n"
    return md

def group_by_headings(soup):
    grouped = []
    # Find all headings
    for hdr in soup.find_all(re.compile("^h[1-6]$")):
        title = normalize_ws(hdr.get_text())
        buffer = []
        # Find next siblings until another heading of this or higher level
        for sib in hdr.find_next_siblings():
            if sib.name and re.match(r"^h[1-6]$", sib.name, re.I):
                if int(sib.name[1]) <= int(hdr.name[1]):
                    break
            if sib.name == "p":
                text = normalize_ws(sib.get_text())
                if text:
                    buffer.append(text)
            elif sib.name in ("ul", "ol"):
                for li in sib.find_all('li'):
                    text = normalize_ws(li.get_text())
                    if text:
                        buffer.append("• " + text)
        if buffer:
            grouped.append({"title": title, "content": "\n\n".join(buffer)})
    return grouped

def sections_to_markdown(sections: List[Dict]) -> str:
    lines: List[str] = []
    for s in sections:
        hashes = "#" * max(1, min(6, s["level"]))
        lines.append(f"{hashes} {s['title']}")
        for p in s["paragraphs"]:
            lines.append(p)
        lines.append("")
    out = "\n".join(lines).strip()
    return out + "\n" if out else out

def slugify(text: str, max_len: int = 80) -> str:
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[\s_-]+", "-", text)
    return text[:max_len] or "page"

async def fetch_and_extract_paragraphs(url):
    paragraphs = []
    async with aiohttp.ClientSession() as session:
        async with session.get(str(url)) as response:
            html = await response.text()
    soup = BeautifulSoup(html, 'html.parser')

    for script in soup(["script", "style"]):
        script.decompose()
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()

    for p in soup.find_all("p"):
        txt = normalize_ws(p.get_text())
        if txt:
            paragraphs.append(txt)
    return paragraphs

async def fetch_and_extract_sections(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(str(url)) as response:
            html = await response.text()
    soup = BeautifulSoup(html, 'html.parser')

    for script in soup(["script", "style"]):
        script.decompose()
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()

    # Prefer by section, or fallback to headings
    chunks = group_by_sections(soup)
    if not chunks:
        chunks = group_by_headings(soup)
    return chunks

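A quick way to exercise the new module outside the API, not part of the commit: fetch one page and print its extracted sections. The URL is a placeholder, and running it assumes aiohttp and beautifulsoup4 are installed and the repository's src directory is on PYTHONPATH.

# Standalone usage sketch for the new webscrape helpers (not part of the commit).
import asyncio

import server.api.utils.webscrape as web_parse

async def main():
    url = "https://example.com/docs/page.html"  # placeholder URL
    sections = await web_parse.fetch_and_extract_sections(url)
    for sec in sections:
        # each section is a {"title": ..., "content": ...} dict
        print(sec["title"] or "(untitled)", "-", len(sec["content"]), "chars")

asyncio.run(main())
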
src/server/api/v1/embed.py

Lines changed: 35 additions & 21 deletions
@@ -12,12 +12,13 @@
 from fastapi import APIRouter, HTTPException, Response, Header, UploadFile
 from fastapi.responses import JSONResponse
 from pydantic import HttpUrl
-import requests
+import aiohttp

 import server.api.utils.oci as utils_oci
 import server.api.utils.databases as utils_databases
 import server.api.utils.embed as utils_embed
 import server.api.utils.models as utils_models
+import server.api.utils.webscrape as web_parse

 from common import functions, schema, logging_config

@@ -76,26 +77,39 @@ async def store_web_file(
     logger.debug("Received store_web_file - request: %s", request)
     temp_directory = utils_embed.get_temp_directory(client, "embedding")

-    # Save the file temporarily
-    for url in request:
-        filename = Path(urlparse(str(url)).path).name
-        request_timeout = 60
-        logger.debug("Requesting: %s (timeout in %is)", url, request_timeout)
-        response = requests.get(url, timeout=request_timeout)
-        content_type = response.headers.get("Content-Type", "").lower()
-
-        if "application/pdf" in content_type or "application/octet-stream" in content_type:
-            with open(temp_directory / filename, "wb") as file:
-                file.write(response.content)
-        elif "text" in content_type or "html" in content_type:
-            with open(temp_directory / filename, "w", encoding="utf-8") as file:
-                file.write(response.text)
-        else:
-            shutil.rmtree(temp_directory)
-            raise HTTPException(
-                status_code=500,
-                detail=f"Unprocessable content type: {content_type}.",
-            )
+    async with aiohttp.ClientSession() as session:
+        for url in request:
+            filename = Path(urlparse(str(url)).path).name
+            request_timeout = aiohttp.ClientTimeout(total=60)
+            logger.debug("Requesting: %s (timeout in %is)", url, request_timeout)
+            async with session.get(str(url), timeout=request_timeout) as response:
+                content_type = response.headers.get("Content-Type", "").lower()
+
+                if "application/pdf" in content_type or "application/octet-stream" in content_type:
+                    with open(temp_directory / filename, "wb") as file:
+                        file.write(await response.read())
+
+                elif "text" in content_type or "html" in content_type:
+                    sections = await web_parse.fetch_and_extract_sections(url)
+                    base = web_parse.slugify(str(url).split('/')[-1]) or "page"
+                    out_files = []
+                    for idx, sec in enumerate(sections, 1):
+                        # filename includes section number and optional slugified title for clarity
+                        stub = web_parse.slugify(sec.get("title", "")) or f"{base}-section{idx}"
+                        sec_filename = f"{stub}.txt"
+                        sec_path = temp_directory / sec_filename
+                        with open(sec_path, "w", encoding="utf-8", errors="replace") as f:
+                            if sec.get("title"):
+                                f.write(sec["title"].strip() + "\n\n")
+                            f.write(str(sec["content"]).strip())
+                        out_files.append(sec_filename)
+
+                else:
+                    shutil.rmtree(temp_directory)
+                    raise HTTPException(
+                        status_code=500,
+                        detail=f"Unprocessable content type: {content_type}.",
+                    )

     stored_files = [f.name for f in temp_directory.iterdir() if f.is_file()]
     return Response(content=json.dumps(stored_files), media_type="application/json")

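To make the text/html branch above concrete, the sketch below walks the per-section filename logic with placeholder data: each extracted section is written to its own .txt file named from its slugified title, and those files are what the new "txt" case in utils/embed.py later loads. The URL and section titles are only examples, not part of the commit.

# Illustrative sketch of the per-section filename logic (placeholder data).
import server.api.utils.webscrape as web_parse

url = "https://example.com/guide/getting-started"  # placeholder URL
base = web_parse.slugify(str(url).split('/')[-1]) or "page"

sections = [
    {"title": "Getting Started", "content": "..."},
    {"title": "Configuration", "content": "..."},
]
for idx, sec in enumerate(sections, 1):
    stub = web_parse.slugify(sec.get("title", "")) or f"{base}-section{idx}"
    print(f"{stub}.txt")
# prints: getting-started.txt, configuration.txt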