diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d11268e --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +# Use one backend (or set TOCIFY_BACKEND=openai|cursor to force). + +# OpenAI: easiest for most users — just set this and run. +OPENAI_API_KEY= +OPENAI_MODEL=gpt-4o-mini + +# Cursor CLI: needs `agent` on PATH and this key. +CURSOR_API_KEY= + +# Optional: openai | cursor (default: auto from which key is set) +# TOCIFY_BACKEND= diff --git a/.github/workflows/weekly-digest-cursor.yml b/.github/workflows/weekly-digest-cursor.yml new file mode 100644 index 0000000..2d82e73 --- /dev/null +++ b/.github/workflows/weekly-digest-cursor.yml @@ -0,0 +1,60 @@ +name: Weekly ToC Digest (Cursor) + +on: + schedule: + # Mondays 08:00 America/Los_Angeles ≈ 16:00 UTC (adjust if you like) + - cron: "00 16 * * 1" + workflow_dispatch: + +permissions: + contents: write + +jobs: + digest: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set Python version + run: echo "PYTHON_VERSION=$(cat .python-version)" >> $GITHUB_ENV + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + enable-cache: true + activate-environment: true + + - name: Install deps + run: uv sync + + - name: Install Cursor CLI + run: | + curl https://cursor.com/install -fsS | bash + echo "$HOME/.cursor/bin" >> $GITHUB_PATH + + - name: Run digest + env: + TOCIFY_BACKEND: "cursor" + CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }} + HTTP_PROXY: "" + HTTPS_PROXY: "" + ALL_PROXY: "" + NO_PROXY: "api.openai.com" + MIN_SCORE_READ: "0.35" + LOOKBACK_DAYS: "7" + SUMMARY_MAX_CHARS: "500" + PREFILTER_KEEP_TOP: "200" + BATCH_SIZE: "50" + run: | + export PATH="$HOME/.cursor/bin:$PATH" + uv run python digest.py + + - name: Commit digest.md + run: | + git config user.name "toc-digest-bot" + git config user.email "toc-digest-bot@users.noreply.github.com" + git add digest.md + git commit -m "Update weekly ToC digest" || exit 0 + git push diff --git a/.github/workflows/weekly-digest.yml b/.github/workflows/weekly-digest.yml index b97c0cf..f7c4563 100644 --- a/.github/workflows/weekly-digest.yml +++ b/.github/workflows/weekly-digest.yml @@ -1,4 +1,4 @@ -name: Weekly ToC Digest +name: Weekly ToC Digest (OpenAI) on: schedule: @@ -16,30 +16,40 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Setup Python - uses: actions/setup-python@v5 + - name: Set Python version + run: echo "PYTHON_VERSION=$(cat .python-version)" >> $GITHUB_ENV + + - name: Install uv + uses: astral-sh/setup-uv@v4 with: - python-version: "3.11" + python-version: ${{ env.PYTHON_VERSION }} + enable-cache: true + activate-environment: true - name: Install deps - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install --upgrade openai httpx certifi + run: uv sync - name: Network check (OpenAI) run: | - python - << 'PY' + uv run python - << 'PY' import socket host = "api.openai.com" print("Resolving:", host) print(socket.gethostbyname(host)) print("OK: DNS resolve") PY - curl -I https://api.openai.com/v1/models --max-time 20 + curl -I https://api.openai.com/v1/models --max-time 20 || true + + - name: Show proxy-related env (debug) + run: | + echo "HTTP_PROXY=$HTTP_PROXY" + echo "HTTPS_PROXY=$HTTPS_PROXY" + echo "ALL_PROXY=$ALL_PROXY" + echo "NO_PROXY=$NO_PROXY" - name: Run digest env: + TOCIFY_BACKEND: "openai" OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} HTTP_PROXY: "" HTTPS_PROXY: "" @@ -50,7 +60,7 @@ jobs: SUMMARY_MAX_CHARS: "500" PREFILTER_KEEP_TOP: "200" BATCH_SIZE: "50" - run: python digest.py + run: uv run python digest.py - name: Commit digest.md run: | diff --git a/.gitignore b/.gitignore index b7faf40..8e1ab69 100644 --- a/.gitignore +++ b/.gitignore @@ -200,8 +200,13 @@ cython_debug/ # refer to https://docs.cursor.com/context/ignore-files .cursorignore .cursorindexingignore +.cursor/ # Marimo marimo/_static/ marimo/_lsp/ __marimo__/ + +# uv +uv.lock +pyproject.toml diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..2c07333 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/README.md b/README.md index 9d9a305..d882596 100644 --- a/README.md +++ b/README.md @@ -1,55 +1,62 @@ -# tocify — Weekly Journal ToC Digest (RSS → OpenAI → `digest.md`) +# tocify — Weekly Journal ToC Digest (RSS → triage → `digest.md`) This repo runs a GitHub Action once a week (or on-demand) that: 1. pulls new items from a list of journal RSS feeds -2. uses OpenAI to triage which items match your research interests +2. triages items against your research interests (OpenAI API or Cursor CLI) 3. writes a ranked digest to `digest.md` and commits it back to the repo It’s meant to be forked and customized. -This was almost entirely vibe-coded as an exercise (I'm pleased at how well it works!) - --- ## What’s in this repo -- **`digest.py`** — the pipeline (fetch RSS → filter → OpenAI triage → render markdown) -- **`feeds.txt`** — RSS feed list (supports comments; optionally supports `Name | URL`) -- **`interests.md`** — your keywords + narrative seed (used for relevance) -- **`prompt.txt`** — the prompt template (easy to tune without editing Python) +- **`digest.py`** — pipeline (fetch RSS → filter → triage → render markdown) +- **`integrations/`** — optional Cursor CLI triage backend (default: in-file OpenAI in digest.py) +- **`feeds.txt`** — RSS feed list (comments; optional `Name | URL`) +- **`interests.md`** — keywords + narrative (used for relevance) +- **`prompt.txt`** — prompt template (used by OpenAI and Cursor backends) - **`digest.md`** — generated output (auto-updated) -- **`.github/workflows/weekly-digest.yml`** — scheduled GitHub Action runner +- **`.github/workflows/weekly-digest.yml`** — scheduled GitHub Action - **`requirements.txt`** — Python dependencies +- **`.python-version`** — pinned Python version (used by uv, pyenv, etc.) --- -## Quick start (fork + run) +## Environment + +Python version is pinned in **`.python-version`** (e.g. `3.11`). The repo supports **[uv](https://docs.astral.sh/uv/)** for fast, reproducible installs: -### 1) Fork the repo -- Click **Fork** on GitHub to copy this repo into your account. +```bash +# Install uv (https://docs.astral.sh/uv/getting-started/installation/), then: +uv venv +uv pip install -r requirements.txt +uv run python digest.py +``` -### 2) Enable OpenAI billing / credits -The OpenAI API requires an active billing setup or credits. -- Go to the OpenAI Platform and ensure billing is enabled and/or credits are available. -- If you see errors like `insufficient_quota` or `You exceeded your current quota`, this is the cause. -- I recommend putting in spending limits. This uses very little compute, but it's nice to be careful. +Alternatively use pip and a venv as usual; the GitHub workflow uses uv and reads `.python-version`. + +--- -### 3) Create an OpenAI API key -Create an API key in the OpenAI Platform and copy it. +## Quick start (layperson: OpenAI) -**Important:** never commit this key to the repo. +1. **Fork** the repo. +2. Set **`OPENAI_API_KEY`** (get one from platform.openai.com). Never commit it. +3. Locally: copy `.env.example` to `.env`, add your key, run `python digest.py`. +4. For GitHub Actions: add secret **`OPENAI_API_KEY`** in Settings → Secrets. The workflow will use it; no CLI needed. -### 4) Add the API key as a GitHub Actions secret -In your forked repo: -- Go to **Settings → Secrets and variables → Actions** -- Click **New repository secret** -- Name: `OPENAI_API_KEY` -- Value: paste your OpenAI API key +## Quick start (Cursor CLI) -That’s it—GitHub will inject it into the workflow at runtime. +1. **Fork** the repo. +2. Install the Cursor CLI and set **`CURSOR_API_KEY`** (Cursor settings). +3. For GitHub Actions: add secret **`CURSOR_API_KEY`** and keep the workflow’s Cursor install step. + +Backend is auto-chosen from which key is set, or set **`TOCIFY_BACKEND=openai`** or **`cursor`** to force. + +--- -### 5) Configure your feeds +## Configure your feeds Edit **`feeds.txt`**. You can use comments: diff --git a/digest.py b/digest.py index d1a6655..659737c 100644 --- a/digest.py +++ b/digest.py @@ -1,14 +1,13 @@ -import os, re, json, time, math, hashlib +import os, re, math, hashlib from datetime import datetime, timezone, timedelta import feedparser -import httpx from dateutil import parser as dtparser -from openai import OpenAI, APITimeoutError, APIConnectionError, RateLimitError +from dotenv import load_dotenv +load_dotenv() # ---- config (env-tweakable) ---- -MODEL = os.getenv("OPENAI_MODEL", "gpt-4o") MAX_ITEMS_PER_FEED = int(os.getenv("MAX_ITEMS_PER_FEED", "50")) MAX_TOTAL_ITEMS = int(os.getenv("MAX_TOTAL_ITEMS", "400")) LOOKBACK_DAYS = int(os.getenv("LOOKBACK_DAYS", "7")) @@ -19,34 +18,6 @@ MIN_SCORE_READ = float(os.getenv("MIN_SCORE_READ", "0.65")) MAX_RETURNED = int(os.getenv("MAX_RETURNED", "40")) -SCHEMA = { - "type": "object", - "additionalProperties": False, - "properties": { - "week_of": {"type": "string"}, - "notes": {"type": "string"}, - "ranked": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": False, - "properties": { - "id": {"type": "string"}, - "title": {"type": "string"}, - "link": {"type": "string"}, - "source": {"type": "string"}, - "published_utc": {"type": ["string", "null"]}, - "score": {"type": "number"}, - "why": {"type": "string"}, - "tags": {"type": "array", "items": {"type": "string"}}, - }, - "required": ["id", "title", "link", "source", "published_utc", "score", "why", "tags"], - }, - }, - }, - "required": ["week_of", "notes", "ranked"], -} - # ---- tiny helpers ---- def load_feeds(path: str) -> list[dict]: @@ -83,12 +54,6 @@ def load_feeds(path: str) -> list[dict]: def read_text(path: str) -> str: with open(path, "r", encoding="utf-8") as f: return f.read() - -def load_prompt_template(path: str = "prompt.txt") -> str: - if not os.path.exists(path): - raise RuntimeError("prompt.txt not found in repo root") - with open(path, "r", encoding="utf-8") as f: - return f.read() def sha1(s: str) -> str: return hashlib.sha1(s.encode("utf-8")).hexdigest() @@ -181,53 +146,9 @@ def hits(it): return matched[:keep_top] -# ---- openai ---- -def make_openai_client() -> OpenAI: - key = os.environ.get("OPENAI_API_KEY", "").strip() - if not key.startswith("sk-"): - raise RuntimeError("OPENAI_API_KEY missing/invalid (expected to start with 'sk-').") - http_client = httpx.Client( - timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0), - http2=False, - trust_env=False, - headers={"Connection": "close", "Accept-Encoding": "gzip"}, - ) - return OpenAI(api_key=key, http_client=http_client) - -def call_openai_triage(client: OpenAI, interests: dict, items: list[dict]) -> dict: - lean_items = [{ - "id": it["id"], - "source": it["source"], - "title": it["title"], - "link": it["link"], - "published_utc": it.get("published_utc"), - "summary": (it.get("summary") or "")[:SUMMARY_MAX_CHARS], - } for it in items] - - template = load_prompt_template() - - prompt = ( - template - .replace("{{KEYWORDS}}", json.dumps(interests["keywords"], ensure_ascii=False)) - .replace("{{NARRATIVE}}", interests["narrative"]) - .replace("{{ITEMS}}", json.dumps(lean_items, ensure_ascii=False)) - ) - - last = None - for attempt in range(6): - try: - resp = client.responses.create( - model=MODEL, - input=prompt, - text={"format": {"type": "json_schema", "name": "weekly_toc_digest", "schema": SCHEMA, "strict": True}}, - ) - return json.loads(resp.output_text) - except (APITimeoutError, APIConnectionError, RateLimitError) as e: - last = e - time.sleep(min(60, 2 ** attempt)) - raise last - -def triage_in_batches(client: OpenAI, interests: dict, items: list[dict], batch_size: int) -> dict: +# ---- triage (backend-agnostic batch loop) ---- +def triage_in_batches(interests: dict, items: list[dict], batch_size: int, triage_fn) -> dict: + """triage_fn(interests, batch) -> dict with keys notes, ranked (and optionally week_of).""" week_of = datetime.now(timezone.utc).date().isoformat() total = math.ceil(len(items) / batch_size) all_ranked, notes_parts = [], [] @@ -235,7 +156,7 @@ def triage_in_batches(client: OpenAI, interests: dict, items: list[dict], batch_ for i in range(0, len(items), batch_size): batch = items[i:i + batch_size] print(f"Triage batch {i // batch_size + 1}/{total} ({len(batch)} items)") - res = call_openai_triage(client, interests, batch) + res = triage_fn(interests, batch) if res.get("notes", "").strip(): notes_parts.append(res["notes"].strip()) all_ranked.extend(res.get("ranked", [])) @@ -308,9 +229,10 @@ def main(): print(f"Sending {len(items)} RSS items to model (post-filter)") items_by_id = {it["id"]: it for it in items} - client = make_openai_client() - result = triage_in_batches(client, interests, items, batch_size=BATCH_SIZE) + from integrations import get_triage_backend + triage_fn = get_triage_backend() + result = triage_in_batches(interests, items, BATCH_SIZE, triage_fn) md = render_digest_md(result, items_by_id) with open("digest.md", "w", encoding="utf-8") as f: diff --git a/integrations/__init__.py b/integrations/__init__.py new file mode 100644 index 0000000..7bf41cf --- /dev/null +++ b/integrations/__init__.py @@ -0,0 +1,38 @@ +"""Triage backends by architecture. Dispatch via TOCIFY_BACKEND; add new backends by registering here.""" + +import os + + +def _openai_backend(): + from integrations import openai_triage + + client = openai_triage.make_openai_client() + return lambda interests, items: openai_triage.call_openai_triage(client, interests, items) + + +def _cursor_backend(): + from integrations import cursor_cli + + if not cursor_cli.is_available(): + raise RuntimeError("Cursor backend requested but CURSOR_API_KEY is not set.") + return cursor_cli.call_cursor_triage + + +# Registry: TOCIFY_BACKEND value -> callable that returns (interests, items) -> dict +_BACKENDS = { + "openai": _openai_backend, + "cursor": _cursor_backend, +} + + +def get_triage_backend(): + """Return a callable (interests, items) -> dict with keys notes, ranked (and optionally week_of).""" + backend = os.getenv("TOCIFY_BACKEND", "").strip().lower() + if not backend: + backend = "cursor" if os.getenv("CURSOR_API_KEY", "").strip() else "openai" + if backend not in _BACKENDS: + raise RuntimeError( + f"Unknown TOCIFY_BACKEND={backend!r}. Known: {list(_BACKENDS)}. " + "Set OPENAI_API_KEY or CURSOR_API_KEY for default backend." + ) + return _BACKENDS[backend]() diff --git a/integrations/_shared.py b/integrations/_shared.py new file mode 100644 index 0000000..1883e2b --- /dev/null +++ b/integrations/_shared.py @@ -0,0 +1,75 @@ +"""Shared prompt template and JSON schema for all triage backends. + +OpenAI, Claude, and Gemini all use JSON Schema for structured output; SCHEMA is the +single source of truth. Cursor has no schema API and uses prompt-only + parse. +""" + +import json +import os + +SCHEMA = { + "type": "object", + "additionalProperties": False, + "properties": { + "week_of": {"type": "string"}, + "notes": {"type": "string"}, + "ranked": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": False, + "properties": { + "id": {"type": "string"}, + "title": {"type": "string"}, + "link": {"type": "string"}, + "source": {"type": "string"}, + "published_utc": {"type": ["string", "null"]}, + "score": {"type": "number"}, + "why": {"type": "string"}, + "tags": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["id", "title", "link", "source", "published_utc", "score", "why", "tags"], + }, + }, + }, + "required": ["week_of", "notes", "ranked"], +} + + +def load_prompt_template(path: str = "prompt.txt") -> str: + if not os.path.exists(path): + raise RuntimeError("prompt.txt not found in repo root") + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +def build_triage_prompt( + interests: dict, items: list[dict], *, summary_max_chars: int = 500 +) -> tuple[str, list[dict]]: + """Build the triage prompt and lean items. Returns (prompt_string, lean_items).""" + lean_items = [ + { + "id": it["id"], + "source": it["source"], + "title": it["title"], + "link": it["link"], + "published_utc": it.get("published_utc"), + "summary": (it.get("summary") or "")[:summary_max_chars], + } + for it in items + ] + template = load_prompt_template() + prompt = ( + template.replace("{{KEYWORDS}}", json.dumps(interests["keywords"], ensure_ascii=False)) + .replace("{{NARRATIVE}}", interests["narrative"]) + .replace("{{ITEMS}}", json.dumps(lean_items, ensure_ascii=False)) + ) + return (prompt, lean_items) + + +def parse_structured_response(response_text: str) -> dict: + """Parse JSON from a structured-output response; validate 'ranked' exists.""" + data = json.loads(response_text) + if not isinstance(data, dict) or "ranked" not in data: + raise ValueError("Response missing required 'ranked' field") + return data diff --git a/integrations/cursor_cli.py b/integrations/cursor_cli.py new file mode 100644 index 0000000..518c13a --- /dev/null +++ b/integrations/cursor_cli.py @@ -0,0 +1,50 @@ +"""Cursor CLI triage backend. Needs CURSOR_API_KEY and `agent` on PATH.""" + +import json +import os +import subprocess +import time + +from integrations._shared import build_triage_prompt, parse_structured_response + +SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "500")) + +# Must match SCHEMA in _shared (Cursor has no structured-output API) +CURSOR_PROMPT_SUFFIX = """ + +Return **only** a single JSON object, no markdown code fences, no commentary. Schema: +{"week_of": "", "notes": "", "ranked": [{"id": "", "title": "", "link": "", "source": "", "published_utc": "", "score": <0-1>, "why": "", "tags": [""]}]} +""" + + +def is_available() -> bool: + return bool(os.environ.get("CURSOR_API_KEY", "").strip()) + + +def call_cursor_triage(interests: dict, items: list[dict]) -> dict: + prompt, _ = build_triage_prompt( + interests, items, summary_max_chars=SUMMARY_MAX_CHARS + ) + prompt = prompt + CURSOR_PROMPT_SUFFIX + args = ["agent", "-p", "--output-format", "text", "--trust", prompt] + last = None + for attempt in range(2): + try: + result = subprocess.run( + args, capture_output=True, text=True, env=os.environ + ) + if result.returncode != 0: + raise RuntimeError( + f"cursor CLI exit {result.returncode}: {result.stderr or result.stdout or 'no output'}" + ) + response_text = (result.stdout or "").strip() + start = response_text.find("{") + end = response_text.rfind("}") + 1 + if start < 0 or end <= start: + raise ValueError("No JSON object found in Cursor output") + return parse_structured_response(response_text[start:end]) + except (ValueError, json.JSONDecodeError, RuntimeError) as e: + last = e + if attempt == 0: + time.sleep(3) + raise last diff --git a/integrations/openai_triage.py b/integrations/openai_triage.py new file mode 100644 index 0000000..1e44485 --- /dev/null +++ b/integrations/openai_triage.py @@ -0,0 +1,43 @@ +"""OpenAI triage backend. Needs OPENAI_API_KEY. Model via OPENAI_MODEL env.""" + +import os +import time + +import httpx +from openai import OpenAI, APITimeoutError, APIConnectionError, RateLimitError + +from integrations._shared import SCHEMA, build_triage_prompt, parse_structured_response + +SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "500")) + + +def make_openai_client() -> OpenAI: + key = os.environ.get("OPENAI_API_KEY", "").strip() + if not key.startswith("sk-"): + raise RuntimeError("OPENAI_API_KEY missing/invalid (expected to start with 'sk-').") + http_client = httpx.Client( + timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0), + http2=False, + trust_env=False, + headers={"Connection": "close", "Accept-Encoding": "gzip"}, + ) + return OpenAI(api_key=key, http_client=http_client) + + +def call_openai_triage(client: OpenAI, interests: dict, items: list[dict]) -> dict: + model = os.getenv("OPENAI_MODEL", "").strip() or "gpt-4o" + prompt, _ = build_triage_prompt(interests, items, summary_max_chars=SUMMARY_MAX_CHARS) + + last = None + for attempt in range(6): + try: + resp = client.responses.create( + model=model, + input=prompt, + text={"format": {"type": "json_schema", "name": "weekly_toc_digest", "schema": SCHEMA, "strict": True}}, + ) + return parse_structured_response(resp.output_text) + except (APITimeoutError, APIConnectionError, RateLimitError) as e: + last = e + time.sleep(min(60, 2 ** attempt)) + raise last diff --git a/requirements.txt b/requirements.txt index 25acdaf..5842168 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -openai>=1.0.0 feedparser>=6.0.0 -python-dateutil>=2.9.0 \ No newline at end of file +python-dateutil>=2.9.0 +python-dotenv>=1.0.0 +openai>=1.0.0 +httpx>=0.27.0 \ No newline at end of file