The schema should be simple, normalized enough for filters, and optimized for SQLite FTS5 search. Do not store raw JSON.
The MVP database represents one imported backup root at a time. Re-running ghbb import may clear and rebuild all imported tables.
schema_meta: schema version.import_runs: one row per import.repositories: imported repositories.labels: repository labels.items: issues, pull requests, discussions, releases.item_labels: many-to-many item labels.comments: comments, reviews, review comments, discussion replies.attachments: user attachment metadata fromattachments/*/manifest.json.release_assets: release asset metadata from release JSON.search_docs: searchable documents that point to an item or comment.search_fts: FTS5 virtual table oversearch_docs.
This SQL is intentionally explicit. Adjust naming if implementation needs minor changes, but preserve the concepts.
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS schema_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
INSERT OR IGNORE INTO schema_meta(key, value) VALUES ('schema_version', '1');
CREATE TABLE IF NOT EXISTS import_runs (
id INTEGER PRIMARY KEY,
backup_root TEXT NOT NULL,
started_at TEXT NOT NULL,
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
error TEXT
);
CREATE TABLE IF NOT EXISTS repositories (
id INTEGER PRIMARY KEY,
owner TEXT NOT NULL,
name TEXT NOT NULL,
full_name TEXT NOT NULL,
source_dir TEXT,
first_seen_at TEXT NOT NULL,
last_import_run_id INTEGER,
FOREIGN KEY(last_import_run_id) REFERENCES import_runs(id) ON DELETE SET NULL,
UNIQUE(owner, name),
UNIQUE(full_name)
);
CREATE TABLE IF NOT EXISTS labels (
id INTEGER PRIMARY KEY,
repo_id INTEGER NOT NULL,
name TEXT NOT NULL,
color TEXT,
description TEXT,
is_default INTEGER,
url TEXT,
FOREIGN KEY(repo_id) REFERENCES repositories(id) ON DELETE CASCADE,
UNIQUE(repo_id, name)
);
CREATE TABLE IF NOT EXISTS items (
id INTEGER PRIMARY KEY,
repo_id INTEGER NOT NULL,
kind TEXT NOT NULL CHECK(kind IN ('issue', 'pull', 'discussion', 'release')),
item_key TEXT NOT NULL,
number INTEGER,
tag_name TEXT,
title TEXT NOT NULL DEFAULT '',
body TEXT NOT NULL DEFAULT '',
body_text TEXT NOT NULL DEFAULT '',
state TEXT,
state_reason TEXT,
author TEXT,
author_association TEXT,
created_at TEXT,
updated_at TEXT,
closed_at TEXT,
published_at TEXT,
url TEXT,
comments_count INTEGER,
reactions_count INTEGER,
-- Pull-request-specific nullable metadata.
merged_at TEXT,
is_draft INTEGER,
base_ref TEXT,
head_ref TEXT,
merge_commit_sha TEXT,
-- Release-specific nullable metadata.
is_prerelease INTEGER,
is_draft_release INTEGER,
target_commitish TEXT,
source_path TEXT NOT NULL,
source_mtime INTEGER,
source_size INTEGER,
imported_at TEXT NOT NULL,
import_run_id INTEGER,
FOREIGN KEY(repo_id) REFERENCES repositories(id) ON DELETE CASCADE,
FOREIGN KEY(import_run_id) REFERENCES import_runs(id) ON DELETE SET NULL,
UNIQUE(repo_id, kind, item_key)
);
CREATE INDEX IF NOT EXISTS idx_items_repo_kind ON items(repo_id, kind);
CREATE INDEX IF NOT EXISTS idx_items_repo_kind_number ON items(repo_id, kind, number);
CREATE INDEX IF NOT EXISTS idx_items_state ON items(state);
CREATE INDEX IF NOT EXISTS idx_items_author ON items(author);
CREATE INDEX IF NOT EXISTS idx_items_updated_at ON items(updated_at);
CREATE INDEX IF NOT EXISTS idx_items_created_at ON items(created_at);
CREATE TABLE IF NOT EXISTS item_labels (
item_id INTEGER NOT NULL,
label_name TEXT NOT NULL,
FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
PRIMARY KEY(item_id, label_name)
);
CREATE INDEX IF NOT EXISTS idx_item_labels_name ON item_labels(label_name);
CREATE TABLE IF NOT EXISTS comments (
id INTEGER PRIMARY KEY,
item_id INTEGER NOT NULL,
kind TEXT NOT NULL CHECK(kind IN (
'issue_comment',
'pull_comment',
'pull_review',
'pull_review_comment',
'discussion_comment',
'discussion_reply'
)),
external_id TEXT,
parent_external_id TEXT,
sequence INTEGER NOT NULL,
author TEXT,
author_association TEXT,
body TEXT NOT NULL DEFAULT '',
body_text TEXT NOT NULL DEFAULT '',
state TEXT,
path TEXT,
diff_hunk TEXT,
commit_id TEXT,
created_at TEXT,
updated_at TEXT,
submitted_at TEXT,
published_at TEXT,
url TEXT,
reactions_count INTEGER,
FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
UNIQUE(item_id, kind, external_id)
);
CREATE INDEX IF NOT EXISTS idx_comments_item ON comments(item_id, sequence);
CREATE INDEX IF NOT EXISTS idx_comments_author ON comments(author);
CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at);
CREATE TABLE IF NOT EXISTS attachments (
id INTEGER PRIMARY KEY,
item_id INTEGER NOT NULL,
manifest_path TEXT NOT NULL,
local_path TEXT,
original_url TEXT,
original_filename TEXT,
saved_as TEXT,
content_type TEXT,
size_bytes INTEGER,
success INTEGER,
http_status INTEGER,
downloaded_at TEXT,
error TEXT,
FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_attachments_item ON attachments(item_id);
CREATE INDEX IF NOT EXISTS idx_attachments_content_type ON attachments(content_type);
CREATE TABLE IF NOT EXISTS release_assets (
id INTEGER PRIMARY KEY,
item_id INTEGER NOT NULL,
name TEXT NOT NULL,
label TEXT,
content_type TEXT,
size_bytes INTEGER,
download_count INTEGER,
state TEXT,
browser_download_url TEXT,
created_at TEXT,
updated_at TEXT,
FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
UNIQUE(item_id, name)
);
CREATE INDEX IF NOT EXISTS idx_release_assets_item ON release_assets(item_id);
CREATE TABLE IF NOT EXISTS search_docs (
id INTEGER PRIMARY KEY,
repo_id INTEGER NOT NULL,
item_id INTEGER NOT NULL,
comment_id INTEGER,
doc_type TEXT NOT NULL,
title TEXT NOT NULL DEFAULT '',
body TEXT NOT NULL DEFAULT '',
author TEXT,
created_at TEXT,
updated_at TEXT,
url TEXT,
FOREIGN KEY(repo_id) REFERENCES repositories(id) ON DELETE CASCADE,
FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
FOREIGN KEY(comment_id) REFERENCES comments(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_search_docs_repo ON search_docs(repo_id);
CREATE INDEX IF NOT EXISTS idx_search_docs_item ON search_docs(item_id);
CREATE INDEX IF NOT EXISTS idx_search_docs_comment ON search_docs(comment_id);
CREATE INDEX IF NOT EXISTS idx_search_docs_type ON search_docs(doc_type);
-- Token characters should make code-ish terms easier to search:
-- ggml_backend_cuda, llama.cpp, C++, file/path.c, issue-123, #include.
-- The exact tokenizer string may need small quote escaping changes in Python.
CREATE VIRTUAL TABLE IF NOT EXISTS search_fts USING fts5(
title,
body,
author,
content='search_docs',
content_rowid='id',
tokenize='unicode61 remove_diacritics 2 tokenchars ''_-./+#:@'''
);item_key avoids nullable uniqueness issues.
Use:
- issue: decimal issue number as text, e.g.
'1234' - pull: decimal PR number as text, e.g.
'10001' - discussion: decimal discussion number as text, e.g.
'32' - release: tag name, e.g.
'b1046','v0.10.0'
For issue/PR/discussion, also fill number as integer. For releases, fill tag_name and leave number null.
Some GitHub discussion JSON contains both Markdown-ish body and plain text bodyText.
Use:
body: best display source, usually Markdown.body_text: best plain text source for search snippets.
For issues, PRs, and releases, body_text can equal body for MVP.
FTS should index body_text when available, falling back to body.
For MVP, rebuild the entire FTS index after import.
Algorithm:
- Delete existing FTS data and search docs.
- Insert one search doc for every item with non-empty title/body.
- Insert one search doc for every comment/review/reply with non-empty body.
- Insert matching rows into
search_ftsusing thesearch_docs.idasrowid.
Pseudo-code:
def rebuild_fts(conn):
conn.execute("DELETE FROM search_fts")
conn.execute("DELETE FROM search_docs")
for item in conn.execute("SELECT ... FROM items"):
doc_id = insert_search_doc(...)
conn.execute(
"INSERT INTO search_fts(rowid, title, body, author) VALUES (?, ?, ?, ?)",
(doc_id, item.title, item.body_text or item.body, item.author or ""),
)
for comment in conn.execute("SELECT ... FROM comments JOIN items ..."):
doc_id = insert_search_doc(...)
conn.execute(
"INSERT INTO search_fts(rowid, title, body, author) VALUES (?, ?, ?, ?)",
(doc_id, item.title, comment.body_text or comment.body, comment.author or ""),
)If DELETE FROM search_fts behaves unexpectedly for external-content FTS, use:
INSERT INTO search_fts(search_fts) VALUES('delete-all');Or drop and recreate the virtual table during rebuild.
Base search SQL should return document hits joined to parent item and repo:
SELECT
sd.id AS search_doc_id,
sd.doc_type,
sd.comment_id,
r.owner,
r.name AS repo_name,
r.full_name,
i.id AS item_id,
i.kind,
i.item_key,
i.number,
i.tag_name,
i.title,
i.state,
i.author AS item_author,
i.created_at AS item_created_at,
i.updated_at AS item_updated_at,
sd.author AS hit_author,
sd.created_at AS hit_created_at,
sd.url AS hit_url,
bm25(search_fts, 8.0, 1.0, 0.5) AS rank,
snippet(search_fts, 0, '[', ']', ' ... ', 12) AS title_snippet,
snippet(search_fts, 1, '[', ']', ' ... ', 32) AS body_snippet
FROM search_fts
JOIN search_docs sd ON sd.id = search_fts.rowid
JOIN items i ON i.id = sd.item_id
JOIN repositories r ON r.id = sd.repo_id
WHERE search_fts MATCH ?
ORDER BY rank ASC
LIMIT ? OFFSET ?;Add optional filters by appending predicates and parameters:
- repo filter:
(r.name = ? OR r.full_name = ?) - kind filter:
i.kind = ? - state filter:
i.state = ? - author filter:
(i.author = ? OR sd.author = ?) - label filter:
EXISTS (SELECT 1 FROM item_labels il WHERE il.item_id = i.id AND il.label_name = ?)
Search JSON should not expose SQLite internals only. Include both IDs and human keys:
{
"query": "KV cache",
"limit": 20,
"offset": 0,
"results": [
{
"search_doc_id": 123,
"item_id": 45,
"comment_id": null,
"repo": "ggml-org/llama.cpp",
"repo_name": "llama.cpp",
"kind": "issue",
"number": 1234,
"item_key": "1234",
"title": "...",
"state": "open",
"doc_type": "item",
"hit_author": "...",
"url": "https://github.com/...",
"snippet": "..."
}
]
}Not required for MVP, but likely useful later:
eventsfor issue/PR timeline events.pull_commitsfor PR commit metadata and commit-message search.milestonesand normalized milestone references.userstable for author metadata.repo_statsmaterialized counts.- Incremental import hashes.