Skip to content

Latest commit

 

History

History
390 lines (328 loc) · 11.4 KB

File metadata and controls

390 lines (328 loc) · 11.4 KB

SQLite Schema Plan

The schema should be simple, normalized enough for filters, and optimized for SQLite FTS5 search. Do not store raw JSON.

The MVP database represents one imported backup root at a time. Re-running ghbb import may clear and rebuild all imported tables.

Tables overview

  • schema_meta: schema version.
  • import_runs: one row per import.
  • repositories: imported repositories.
  • labels: repository labels.
  • items: issues, pull requests, discussions, releases.
  • item_labels: many-to-many item labels.
  • comments: comments, reviews, review comments, discussion replies.
  • attachments: user attachment metadata from attachments/*/manifest.json.
  • release_assets: release asset metadata from release JSON.
  • search_docs: searchable documents that point to an item or comment.
  • search_fts: FTS5 virtual table over search_docs.

Initial SQL

This SQL is intentionally explicit. Adjust naming if implementation needs minor changes, but preserve the concepts.

PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS schema_meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

INSERT OR IGNORE INTO schema_meta(key, value) VALUES ('schema_version', '1');

CREATE TABLE IF NOT EXISTS import_runs (
    id INTEGER PRIMARY KEY,
    backup_root TEXT NOT NULL,
    started_at TEXT NOT NULL,
    finished_at TEXT,
    status TEXT NOT NULL DEFAULT 'running',
    error TEXT
);

CREATE TABLE IF NOT EXISTS repositories (
    id INTEGER PRIMARY KEY,
    owner TEXT NOT NULL,
    name TEXT NOT NULL,
    full_name TEXT NOT NULL,
    source_dir TEXT,
    first_seen_at TEXT NOT NULL,
    last_import_run_id INTEGER,
    FOREIGN KEY(last_import_run_id) REFERENCES import_runs(id) ON DELETE SET NULL,
    UNIQUE(owner, name),
    UNIQUE(full_name)
);

CREATE TABLE IF NOT EXISTS labels (
    id INTEGER PRIMARY KEY,
    repo_id INTEGER NOT NULL,
    name TEXT NOT NULL,
    color TEXT,
    description TEXT,
    is_default INTEGER,
    url TEXT,
    FOREIGN KEY(repo_id) REFERENCES repositories(id) ON DELETE CASCADE,
    UNIQUE(repo_id, name)
);

CREATE TABLE IF NOT EXISTS items (
    id INTEGER PRIMARY KEY,
    repo_id INTEGER NOT NULL,
    kind TEXT NOT NULL CHECK(kind IN ('issue', 'pull', 'discussion', 'release')),
    item_key TEXT NOT NULL,
    number INTEGER,
    tag_name TEXT,
    title TEXT NOT NULL DEFAULT '',
    body TEXT NOT NULL DEFAULT '',
    body_text TEXT NOT NULL DEFAULT '',
    state TEXT,
    state_reason TEXT,
    author TEXT,
    author_association TEXT,
    created_at TEXT,
    updated_at TEXT,
    closed_at TEXT,
    published_at TEXT,
    url TEXT,
    comments_count INTEGER,
    reactions_count INTEGER,

    -- Pull-request-specific nullable metadata.
    merged_at TEXT,
    is_draft INTEGER,
    base_ref TEXT,
    head_ref TEXT,
    merge_commit_sha TEXT,

    -- Release-specific nullable metadata.
    is_prerelease INTEGER,
    is_draft_release INTEGER,
    target_commitish TEXT,

    source_path TEXT NOT NULL,
    source_mtime INTEGER,
    source_size INTEGER,
    imported_at TEXT NOT NULL,
    import_run_id INTEGER,

    FOREIGN KEY(repo_id) REFERENCES repositories(id) ON DELETE CASCADE,
    FOREIGN KEY(import_run_id) REFERENCES import_runs(id) ON DELETE SET NULL,
    UNIQUE(repo_id, kind, item_key)
);

CREATE INDEX IF NOT EXISTS idx_items_repo_kind ON items(repo_id, kind);
CREATE INDEX IF NOT EXISTS idx_items_repo_kind_number ON items(repo_id, kind, number);
CREATE INDEX IF NOT EXISTS idx_items_state ON items(state);
CREATE INDEX IF NOT EXISTS idx_items_author ON items(author);
CREATE INDEX IF NOT EXISTS idx_items_updated_at ON items(updated_at);
CREATE INDEX IF NOT EXISTS idx_items_created_at ON items(created_at);

CREATE TABLE IF NOT EXISTS item_labels (
    item_id INTEGER NOT NULL,
    label_name TEXT NOT NULL,
    FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
    PRIMARY KEY(item_id, label_name)
);

CREATE INDEX IF NOT EXISTS idx_item_labels_name ON item_labels(label_name);

CREATE TABLE IF NOT EXISTS comments (
    id INTEGER PRIMARY KEY,
    item_id INTEGER NOT NULL,
    kind TEXT NOT NULL CHECK(kind IN (
        'issue_comment',
        'pull_comment',
        'pull_review',
        'pull_review_comment',
        'discussion_comment',
        'discussion_reply'
    )),
    external_id TEXT,
    parent_external_id TEXT,
    sequence INTEGER NOT NULL,
    author TEXT,
    author_association TEXT,
    body TEXT NOT NULL DEFAULT '',
    body_text TEXT NOT NULL DEFAULT '',
    state TEXT,
    path TEXT,
    diff_hunk TEXT,
    commit_id TEXT,
    created_at TEXT,
    updated_at TEXT,
    submitted_at TEXT,
    published_at TEXT,
    url TEXT,
    reactions_count INTEGER,
    FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
    UNIQUE(item_id, kind, external_id)
);

CREATE INDEX IF NOT EXISTS idx_comments_item ON comments(item_id, sequence);
CREATE INDEX IF NOT EXISTS idx_comments_author ON comments(author);
CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at);

CREATE TABLE IF NOT EXISTS attachments (
    id INTEGER PRIMARY KEY,
    item_id INTEGER NOT NULL,
    manifest_path TEXT NOT NULL,
    local_path TEXT,
    original_url TEXT,
    original_filename TEXT,
    saved_as TEXT,
    content_type TEXT,
    size_bytes INTEGER,
    success INTEGER,
    http_status INTEGER,
    downloaded_at TEXT,
    error TEXT,
    FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_attachments_item ON attachments(item_id);
CREATE INDEX IF NOT EXISTS idx_attachments_content_type ON attachments(content_type);

CREATE TABLE IF NOT EXISTS release_assets (
    id INTEGER PRIMARY KEY,
    item_id INTEGER NOT NULL,
    name TEXT NOT NULL,
    label TEXT,
    content_type TEXT,
    size_bytes INTEGER,
    download_count INTEGER,
    state TEXT,
    browser_download_url TEXT,
    created_at TEXT,
    updated_at TEXT,
    FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
    UNIQUE(item_id, name)
);

CREATE INDEX IF NOT EXISTS idx_release_assets_item ON release_assets(item_id);

CREATE TABLE IF NOT EXISTS search_docs (
    id INTEGER PRIMARY KEY,
    repo_id INTEGER NOT NULL,
    item_id INTEGER NOT NULL,
    comment_id INTEGER,
    doc_type TEXT NOT NULL,
    title TEXT NOT NULL DEFAULT '',
    body TEXT NOT NULL DEFAULT '',
    author TEXT,
    created_at TEXT,
    updated_at TEXT,
    url TEXT,
    FOREIGN KEY(repo_id) REFERENCES repositories(id) ON DELETE CASCADE,
    FOREIGN KEY(item_id) REFERENCES items(id) ON DELETE CASCADE,
    FOREIGN KEY(comment_id) REFERENCES comments(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_search_docs_repo ON search_docs(repo_id);
CREATE INDEX IF NOT EXISTS idx_search_docs_item ON search_docs(item_id);
CREATE INDEX IF NOT EXISTS idx_search_docs_comment ON search_docs(comment_id);
CREATE INDEX IF NOT EXISTS idx_search_docs_type ON search_docs(doc_type);

-- Token characters should make code-ish terms easier to search:
-- ggml_backend_cuda, llama.cpp, C++, file/path.c, issue-123, #include.
-- The exact tokenizer string may need small quote escaping changes in Python.
CREATE VIRTUAL TABLE IF NOT EXISTS search_fts USING fts5(
    title,
    body,
    author,
    content='search_docs',
    content_rowid='id',
    tokenize='unicode61 remove_diacritics 2 tokenchars ''_-./+#:@'''
);

Notes on items.item_key

item_key avoids nullable uniqueness issues.

Use:

  • issue: decimal issue number as text, e.g. '1234'
  • pull: decimal PR number as text, e.g. '10001'
  • discussion: decimal discussion number as text, e.g. '32'
  • release: tag name, e.g. 'b1046', 'v0.10.0'

For issue/PR/discussion, also fill number as integer. For releases, fill tag_name and leave number null.

body vs body_text

Some GitHub discussion JSON contains both Markdown-ish body and plain text bodyText.

Use:

  • body: best display source, usually Markdown.
  • body_text: best plain text source for search snippets.

For issues, PRs, and releases, body_text can equal body for MVP.

FTS should index body_text when available, falling back to body.

FTS rebuild strategy

For MVP, rebuild the entire FTS index after import.

Algorithm:

  1. Delete existing FTS data and search docs.
  2. Insert one search doc for every item with non-empty title/body.
  3. Insert one search doc for every comment/review/reply with non-empty body.
  4. Insert matching rows into search_fts using the search_docs.id as rowid.

Pseudo-code:

def rebuild_fts(conn):
    conn.execute("DELETE FROM search_fts")
    conn.execute("DELETE FROM search_docs")

    for item in conn.execute("SELECT ... FROM items"):
        doc_id = insert_search_doc(...)
        conn.execute(
            "INSERT INTO search_fts(rowid, title, body, author) VALUES (?, ?, ?, ?)",
            (doc_id, item.title, item.body_text or item.body, item.author or ""),
        )

    for comment in conn.execute("SELECT ... FROM comments JOIN items ..."):
        doc_id = insert_search_doc(...)
        conn.execute(
            "INSERT INTO search_fts(rowid, title, body, author) VALUES (?, ?, ?, ?)",
            (doc_id, item.title, comment.body_text or comment.body, comment.author or ""),
        )

If DELETE FROM search_fts behaves unexpectedly for external-content FTS, use:

INSERT INTO search_fts(search_fts) VALUES('delete-all');

Or drop and recreate the virtual table during rebuild.

Search query shape

Base search SQL should return document hits joined to parent item and repo:

SELECT
    sd.id AS search_doc_id,
    sd.doc_type,
    sd.comment_id,
    r.owner,
    r.name AS repo_name,
    r.full_name,
    i.id AS item_id,
    i.kind,
    i.item_key,
    i.number,
    i.tag_name,
    i.title,
    i.state,
    i.author AS item_author,
    i.created_at AS item_created_at,
    i.updated_at AS item_updated_at,
    sd.author AS hit_author,
    sd.created_at AS hit_created_at,
    sd.url AS hit_url,
    bm25(search_fts, 8.0, 1.0, 0.5) AS rank,
    snippet(search_fts, 0, '[', ']', ' ... ', 12) AS title_snippet,
    snippet(search_fts, 1, '[', ']', ' ... ', 32) AS body_snippet
FROM search_fts
JOIN search_docs sd ON sd.id = search_fts.rowid
JOIN items i ON i.id = sd.item_id
JOIN repositories r ON r.id = sd.repo_id
WHERE search_fts MATCH ?
ORDER BY rank ASC
LIMIT ? OFFSET ?;

Add optional filters by appending predicates and parameters:

  • repo filter: (r.name = ? OR r.full_name = ?)
  • kind filter: i.kind = ?
  • state filter: i.state = ?
  • author filter: (i.author = ? OR sd.author = ?)
  • label filter: EXISTS (SELECT 1 FROM item_labels il WHERE il.item_id = i.id AND il.label_name = ?)

Stable JSON output concepts

Search JSON should not expose SQLite internals only. Include both IDs and human keys:

{
  "query": "KV cache",
  "limit": 20,
  "offset": 0,
  "results": [
    {
      "search_doc_id": 123,
      "item_id": 45,
      "comment_id": null,
      "repo": "ggml-org/llama.cpp",
      "repo_name": "llama.cpp",
      "kind": "issue",
      "number": 1234,
      "item_key": "1234",
      "title": "...",
      "state": "open",
      "doc_type": "item",
      "hit_author": "...",
      "url": "https://github.com/...",
      "snippet": "..."
    }
  ]
}

Future schema additions

Not required for MVP, but likely useful later:

  • events for issue/PR timeline events.
  • pull_commits for PR commit metadata and commit-message search.
  • milestones and normalized milestone references.
  • users table for author metadata.
  • repo_stats materialized counts.
  • Incremental import hashes.