diff --git a/.github/workflows/pypi-package.yaml b/.github/workflows/pypi-package.yaml index f0ea428..4a5923c 100644 --- a/.github/workflows/pypi-package.yaml +++ b/.github/workflows/pypi-package.yaml @@ -7,6 +7,11 @@ on: description: "Version to use for the Python package (e.g. 0.1.0)" required: true type: string + test-pypi: + description: "Publish to Test PyPI" + required: false + type: boolean + default: false release: types: [published] @@ -55,4 +60,4 @@ jobs: # Avoid workflow to fail if the version has already been published skip-existing: true # Upload to Test Pypi for testing - #repository-url: https://test.pypi.org/legacy/ + repository-url: ${{ github.event.inputs.test-pypi == 'true' && 'https://test.pypi.org/legacy/' || '' }} diff --git a/README.md b/README.md index f879a66..9c1f078 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,13 @@ # SQLite RAG -[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg?branch=main&event=release)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml) +[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml) [![codecov](https://codecov.io/github/sqliteai/sqlite-rag/graph/badge.svg?token=30KYPY7864)](https://codecov.io/github/sqliteai/sqlite-rag) ![PyPI - Version](https://img.shields.io/pypi/v/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag%2F) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag) -A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions. SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval. +A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions. +SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval. ## Features @@ -20,6 +21,13 @@ A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqlit ## Installation +### Prerequisites + +SQLite RAG requires SQLite with _extension loading_ support. +If you encounter extension loading issues (e.g., `'sqlite3.Connection' object has no attribute 'enable_load_extension'`), follow the setup guides for [macOS](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/macos.md#python-on-macos) or [Windows](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/windows.md#using-sqlite-with-python). + +### Install SQLite RAG + ```bash python3 -m venv .venv source .venv/bin/activate # On Windows: .venv\Scripts\activate diff --git a/pyproject.toml b/pyproject.toml index 90a8c65..8bc080d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "attrs", "typer", "huggingface_hub[hf_transfer]", "markitdown[docx]", diff --git a/src/sqlite_rag/models/chunk.py b/src/sqlite_rag/models/chunk.py index 4beae4f..15bb26b 100644 --- a/src/sqlite_rag/models/chunk.py +++ b/src/sqlite_rag/models/chunk.py @@ -1,4 +1,4 @@ -from attr import dataclass +from dataclasses import dataclass @dataclass diff --git a/src/sqlite_rag/models/document.py b/src/sqlite_rag/models/document.py index 9b727e4..e8e4685 100644 --- a/src/sqlite_rag/models/document.py +++ b/src/sqlite_rag/models/document.py @@ -1,10 +1,9 @@ import hashlib import re +from dataclasses import dataclass, field from datetime import datetime from typing import Optional -from attr import dataclass - from .chunk import Chunk @@ -15,11 +14,11 @@ class Document: id: str | None = None content: str = "" uri: str | None = None - metadata: dict = {} + metadata: dict = field(default_factory=dict) created_at: datetime | None = None updated_at: datetime | None = None - chunks: list["Chunk"] = [] + chunks: list["Chunk"] = field(default_factory=list) def hash(self) -> str: """Generate a hash for the document content using SHA-3 for maximum collision resistance""" @@ -55,11 +54,11 @@ def extract_document_title(self, fallback_first_line: bool = False) -> str | Non if match: return match.group(1).strip() - # Fallback: first non-empty line + # Fallback: first non-empty line with at least one word if fallback_first_line: for line in self.content.splitlines(): line = line.strip() - if line: + if line and re.search(r"\w", line): return line[: self.GENERATED_TITLE_MAX_CHARS] return None diff --git a/src/sqlite_rag/models/document_result.py b/src/sqlite_rag/models/document_result.py index 9346b18..2a89298 100644 --- a/src/sqlite_rag/models/document_result.py +++ b/src/sqlite_rag/models/document_result.py @@ -1,4 +1,4 @@ -from attr import dataclass +from dataclasses import dataclass from .document import Document diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py index 9081196..dfebbeb 100644 --- a/src/sqlite_rag/sqliterag.py +++ b/src/sqlite_rag/sqliterag.py @@ -103,7 +103,7 @@ def add( if use_relative_paths else str(file_path.absolute()) ) - document = Document(content=content, uri=uri, metadata=metadata) + document = Document(content=content, uri=uri, metadata=metadata.copy()) exists = self._repository.document_exists_by_hash(document.hash()) if exists: @@ -132,7 +132,7 @@ def add_text( """Add a text content into the database""" self._ensure_initialized() - document = Document(content=text, uri=uri, metadata=metadata) + document = Document(content=text, uri=uri, metadata=metadata.copy()) self._engine.create_new_context() document = self._engine.process(document) diff --git a/tests/integration/test_engine.py b/tests/integration/test_engine.py index 790bfc0..9b99ff6 100644 --- a/tests/integration/test_engine.py +++ b/tests/integration/test_engine.py @@ -3,8 +3,6 @@ import pytest -from sqlite_rag.models.chunk import Chunk - class TestEngine: @pytest.mark.slow @@ -20,8 +18,8 @@ def random_string(length=30): result_chunks = {} for i in range(1000): try: - chunk = engine.generate_embeddings([Chunk(content=random_string())]) - result_chunks[chunk[0].embedding.hex()] = chunk[0] + embedding = engine.generate_embedding(random_string()) + result_chunks[embedding.hex()] = embedding assert len(result_chunks) == i + 1 except Exception as e: pytest.fail(f"Embedding generation failed on chunk {i}: {e}") diff --git a/tests/models/test_document.py b/tests/models/test_document.py index d3248cf..9da4ea1 100644 --- a/tests/models/test_document.py +++ b/tests/models/test_document.py @@ -45,3 +45,11 @@ def test_extract_document_title_without_heading( assert ( doc.extract_document_title(fallback_first_line=fallback) == expected_title ) + + def test_extract_document_title_with_a_word(self): + content = "---\n \n Leading spaces line with a word." + doc = Document(content=content, metadata={}) + assert ( + doc.extract_document_title(fallback_first_line=True) + == "Leading spaces line with a word." + ) diff --git a/tests/test_sqlite_rag.py b/tests/test_sqlite_rag.py index 5c7a3e4..f9fcac4 100644 --- a/tests/test_sqlite_rag.py +++ b/tests/test_sqlite_rag.py @@ -139,7 +139,39 @@ def test_add_file_with_metadata(self): doc = cursor.fetchone() assert doc assert doc[0] == "This is a test document with metadata." - assert doc[1] == json.dumps(metadata) + assert doc[1] == json.dumps( + { + **metadata, + "generated": {"title": "This is a test document with metadata."}, + } + ) + + def test_add_documents_with_generated_title(self): + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as doc1: + doc1.write("# Title 1\nThis is the first test document.") + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as doc2: + doc2.write("# Title 2\nThis is the second test document.") + + doc3 = "# Title 3\nThis is the third test document." + doc4 = "# Title 4\nThis is the fourth test document." + + rag = SQLiteRag.create(db_path=":memory:") + + rag.add(doc1.name) + rag.add(doc2.name) + rag.add_text(doc3) + rag.add_text(doc4) + + conn = rag._conn + cursor = conn.execute("SELECT metadata FROM documents") + docs = cursor.fetchall() + assert len(docs) == 4 + + titles = [json.loads(doc[0]).get("generated", {}).get("title") for doc in docs] + assert "Title 1" in titles + assert "Title 2" in titles + assert "Title 3" in titles + assert "Title 4" in titles def test_add_empty_file(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: @@ -229,7 +261,14 @@ def test_add_text_with_metadata(self): assert doc assert doc[0] == "This is a test document content with metadata." assert doc[1] == "test_doc_with_metadata.txt" - assert doc[2] == json.dumps(metadata) + assert doc[2] == json.dumps( + { + **metadata, + "generated": { + "title": "This is a test document content with metadata." + }, + } + ) def test_list_documents(self): rag = SQLiteRag.create(":memory:")