Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .github/workflows/pypi-package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ on:
description: "Version to use for the Python package (e.g. 0.1.0)"
required: true
type: string
test-pypi:
description: "Publish to Test PyPI"
required: false
type: boolean
default: false
release:
types: [published]

Expand Down Expand Up @@ -55,4 +60,4 @@ jobs:
# Prevent the workflow from failing if the version has already been published
skip-existing: true
# Upload to Test PyPI for testing
#repository-url: https://test.pypi.org/legacy/
repository-url: ${{ github.event.inputs.test-pypi == 'true' && 'https://test.pypi.org/legacy/' || '' }}
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

# SQLite RAG

[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg?branch=main&event=release)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml)
[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml)
[![codecov](https://codecov.io/github/sqliteai/sqlite-rag/graph/badge.svg?token=30KYPY7864)](https://codecov.io/github/sqliteai/sqlite-rag)
![PyPI - Version](https://img.shields.io/pypi/v/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag%2F)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag)

A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions. SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval.
A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions.
SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval.

## Features

Expand All @@ -20,6 +21,13 @@ A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqlit

## Installation

### Prerequisites

SQLite RAG requires SQLite with _extension loading_ support.
If you encounter extension loading issues (e.g., `'sqlite3.Connection' object has no attribute 'enable_load_extension'`), follow the setup guides for [macOS](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/macos.md#python-on-macos) or [Windows](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/windows.md#using-sqlite-with-python).

### Install SQLite RAG

```bash
python3 -m venv .venv
source .venv/bin/activate # On Windows: .venv\Scripts\activate
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"typer",
"huggingface_hub[hf_transfer]",
"markitdown[docx]",
Expand Down
2 changes: 1 addition & 1 deletion src/sqlite_rag/models/chunk.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from attr import dataclass
from dataclasses import dataclass


@dataclass
Expand Down
11 changes: 5 additions & 6 deletions src/sqlite_rag/models/document.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import hashlib
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

from attr import dataclass

from .chunk import Chunk


Expand All @@ -15,11 +14,11 @@ class Document:
id: str | None = None
content: str = ""
uri: str | None = None
metadata: dict = {}
metadata: dict = field(default_factory=dict)
created_at: datetime | None = None
updated_at: datetime | None = None

chunks: list["Chunk"] = []
chunks: list["Chunk"] = field(default_factory=list)

def hash(self) -> str:
"""Generate a hash for the document content using SHA-3 for maximum collision resistance"""
Expand Down Expand Up @@ -55,11 +54,11 @@ def extract_document_title(self, fallback_first_line: bool = False) -> str | Non
if match:
return match.group(1).strip()

# Fallback: first non-empty line
# Fallback: first non-empty line with at least one word
if fallback_first_line:
for line in self.content.splitlines():
line = line.strip()
if line:
if line and re.search(r"\w", line):
return line[: self.GENERATED_TITLE_MAX_CHARS]

return None
2 changes: 1 addition & 1 deletion src/sqlite_rag/models/document_result.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from attr import dataclass
from dataclasses import dataclass

from .document import Document

Expand Down
4 changes: 2 additions & 2 deletions src/sqlite_rag/sqliterag.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def add(
if use_relative_paths
else str(file_path.absolute())
)
document = Document(content=content, uri=uri, metadata=metadata)
document = Document(content=content, uri=uri, metadata=metadata.copy())

exists = self._repository.document_exists_by_hash(document.hash())
if exists:
Expand Down Expand Up @@ -132,7 +132,7 @@ def add_text(
"""Add a text content into the database"""
self._ensure_initialized()

document = Document(content=text, uri=uri, metadata=metadata)
document = Document(content=text, uri=uri, metadata=metadata.copy())

self._engine.create_new_context()
document = self._engine.process(document)
Expand Down
6 changes: 2 additions & 4 deletions tests/integration/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

import pytest

from sqlite_rag.models.chunk import Chunk


class TestEngine:
@pytest.mark.slow
Expand All @@ -20,8 +18,8 @@ def random_string(length=30):
result_chunks = {}
for i in range(1000):
try:
chunk = engine.generate_embeddings([Chunk(content=random_string())])
result_chunks[chunk[0].embedding.hex()] = chunk[0]
embedding = engine.generate_embedding(random_string())
result_chunks[embedding.hex()] = embedding
assert len(result_chunks) == i + 1
except Exception as e:
pytest.fail(f"Embedding generation failed on chunk {i}: {e}")
Expand Down
8 changes: 8 additions & 0 deletions tests/models/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,11 @@ def test_extract_document_title_without_heading(
assert (
doc.extract_document_title(fallback_first_line=fallback) == expected_title
)

def test_extract_document_title_with_a_word(self):
    """The first-line fallback skips lines without any word character.

    The content starts with a ``---`` line and a whitespace-only line, so the
    title must come from the third line, returned without its leading spaces.
    """
    text = "---\n \n Leading spaces line with a word."
    document = Document(content=text, metadata={})

    title = document.extract_document_title(fallback_first_line=True)

    assert title == "Leading spaces line with a word."
43 changes: 41 additions & 2 deletions tests/test_sqlite_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,39 @@ def test_add_file_with_metadata(self):
doc = cursor.fetchone()
assert doc
assert doc[0] == "This is a test document with metadata."
assert doc[1] == json.dumps(metadata)
assert doc[1] == json.dumps(
{
**metadata,
"generated": {"title": "This is a test document with metadata."},
}
)

def test_add_documents_with_generated_title(self):
    """Documents added from files and from raw text all get a generated title.

    Two documents are added through ``add`` (file paths) and two through
    ``add_text``. Each stored metadata JSON must expose the text of the
    document's ``# ...`` heading under ``generated.title``.
    """
    # Local import: only this test needs it, and the file's import block is
    # not visible from here.
    from pathlib import Path

    file_contents = {
        "doc1.txt": "# Title 1\nThis is the first test document.",
        "doc2.txt": "# Title 2\nThis is the second test document.",
    }
    doc3 = "# Title 3\nThis is the third test document."
    doc4 = "# Title 4\nThis is the fourth test document."

    # A temporary directory removes the fixture files when the block exits,
    # instead of leaving delete=False temp files behind after every run.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_paths = []
        for file_name, text in file_contents.items():
            file_path = Path(tmp_dir) / file_name
            file_path.write_text(text)
            file_paths.append(str(file_path))

        rag = SQLiteRag.create(db_path=":memory:")

        for file_path in file_paths:
            rag.add(file_path)
        rag.add_text(doc3)
        rag.add_text(doc4)

        conn = rag._conn
        cursor = conn.execute("SELECT metadata FROM documents")
        docs = cursor.fetchall()

    assert len(docs) == 4

    titles = [json.loads(doc[0]).get("generated", {}).get("title") for doc in docs]
    for expected_title in ("Title 1", "Title 2", "Title 3", "Title 4"):
        assert expected_title in titles

def test_add_empty_file(self):
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
Expand Down Expand Up @@ -229,7 +261,14 @@ def test_add_text_with_metadata(self):
assert doc
assert doc[0] == "This is a test document content with metadata."
assert doc[1] == "test_doc_with_metadata.txt"
assert doc[2] == json.dumps(metadata)
assert doc[2] == json.dumps(
{
**metadata,
"generated": {
"title": "This is a test document content with metadata."
},
}
)

def test_list_documents(self):
rag = SQLiteRag.create(":memory:")
Expand Down