From a3777c0ed4a331322c376739a715deb6ff0c9eab Mon Sep 17 00:00:00 2001 From: Wintermute Date: Mon, 4 May 2026 19:54:16 +0000 Subject: [PATCH 01/15] =?UTF-8?q?feat:=20recency=20boost=20for=20search=20?= =?UTF-8?q?(v0.27.0)=20=E2=80=94=20temporal=20intent=20auto-detection,=20d?= =?UTF-8?q?ate=20filters,=20configurable=20decay?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New search pipeline stage: keyword + vector → RRF → cosine re-score → backlink boost → recency boost → dedup - applyRecencyBoost: hyperbolic decay, two strengths (moderate 30-day halflife, aggressive 7-day halflife) - Auto-enabled when intent.ts detects temporal/event queries (detail='high') - Manual override via SearchOpts.recencyBoost (0/1/2) - Date filtering: afterDate/beforeDate on all three search paths (keyword, keywordChunks, vector) - getPageTimestamps on both Postgres and PGLite engines - 15 tests passing (boost math + intent classification) --- src/core/engine.ts | 6 ++ src/core/pglite-engine.ts | 10 +++ src/core/postgres-engine.ts | 49 +++++++++++++ src/core/search/hybrid.ts | 18 +++++ src/core/search/recency.ts | 68 ++++++++++++++++++ src/core/types.ts | 6 ++ test/recency-boost.test.ts | 138 ++++++++++++++++++++++++++++++++++++ 7 files changed, 295 insertions(+) create mode 100644 src/core/search/recency.ts create mode 100644 test/recency-boost.test.ts diff --git a/src/core/engine.ts b/src/core/engine.ts index f2fe9bf40..7f11c6dc1 100644 --- a/src/core/engine.ts +++ b/src/core/engine.ts @@ -377,6 +377,12 @@ export interface BrainEngine { * Slugs with zero inbound links are present in the map with value 0. */ getBacklinkCounts(slugs: string[]): Promise>; + /** + * v0.27.0: for a list of slugs, return their updated_at timestamps (or created_at fallback). + * Used by hybrid search recency boost. Single SQL query, not N+1. + * Slugs with no timestamp get no entry in the map. 
+ */ + getPageTimestamps(slugs: string[]): Promise>; /** * Return every page with no inbound links (from any source). * Domain comes from the frontmatter `domain` field (null if unset). diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 4a68bd517..d01365895 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -1208,6 +1208,16 @@ export class PGLiteEngine implements BrainEngine { return result; } + async getPageTimestamps(slugs: string[]): Promise> { + if (slugs.length === 0) return new Map(); + const { rows } = await this.db.query( + `SELECT slug, COALESCE(updated_at, created_at) as ts + FROM pages WHERE slug = ANY($1::text[])`, + [slugs] + ); + return new Map(rows.map((r: any) => [r.slug as string, new Date(r.ts as string)])); + } + async findOrphanPages(): Promise> { const { rows } = await this.db.query( `SELECT diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index 84599fc21..b06301e70 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -521,6 +521,17 @@ export class PostgresEngine implements BrainEngine { params.push(symbolKind); symbolKindClause = `AND cc.symbol_type = $${params.length}`; } + // v0.27.0: date filtering support + let afterDateClause = ''; + if (opts?.afterDate) { + params.push(opts.afterDate); + afterDateClause = `AND COALESCE(p.updated_at, p.created_at) > $${params.length}::timestamptz`; + } + let beforeDateClause = ''; + if (opts?.beforeDate) { + params.push(opts.beforeDate); + beforeDateClause = `AND COALESCE(p.updated_at, p.created_at) < $${params.length}::timestamptz`; + } params.push(innerLimit); const innerLimitParam = `$${params.length}`; params.push(limit); @@ -549,6 +560,8 @@ export class PostgresEngine implements BrainEngine { ${detailLow ? 
`AND cc.chunk_source = 'compiled_truth'` : ''} ${languageClause} ${symbolKindClause} + ${afterDateClause} + ${beforeDateClause} ${hardExcludeClause} ${visibilityClause} ORDER BY score DESC @@ -630,6 +643,17 @@ export class PostgresEngine implements BrainEngine { params.push(symbolKind); symbolKindClause = `AND cc.symbol_type = $${params.length}`; } + // v0.27.0: date filtering support + let afterDateClause = ''; + if (opts?.afterDate) { + params.push(opts.afterDate); + afterDateClause = `AND COALESCE(p.updated_at, p.created_at) > $${params.length}::timestamptz`; + } + let beforeDateClause = ''; + if (opts?.beforeDate) { + params.push(opts.beforeDate); + beforeDateClause = `AND COALESCE(p.updated_at, p.created_at) < $${params.length}::timestamptz`; + } params.push(limit); const limitParam = `$${params.length}`; params.push(offset); @@ -653,6 +677,8 @@ export class PostgresEngine implements BrainEngine { ${detailLow ? `AND cc.chunk_source = 'compiled_truth'` : ''} ${languageClause} ${symbolKindClause} + ${afterDateClause} + ${beforeDateClause} ${hardExcludeClause} ${visibilityClause} ORDER BY score DESC @@ -718,6 +744,17 @@ export class PostgresEngine implements BrainEngine { params.push(symbolKind); symbolKindClause = `AND cc.symbol_type = $${params.length}`; } + // v0.27.0: date filtering support + let afterDateClause = ''; + if (opts?.afterDate) { + params.push(opts.afterDate); + afterDateClause = `AND COALESCE(p.updated_at, p.created_at) > $${params.length}::timestamptz`; + } + let beforeDateClause = ''; + if (opts?.beforeDate) { + params.push(opts.beforeDate); + beforeDateClause = `AND COALESCE(p.updated_at, p.created_at) < $${params.length}::timestamptz`; + } params.push(innerLimit); const innerLimitParam = `$${params.length}`; params.push(limit); @@ -746,6 +783,8 @@ export class PostgresEngine implements BrainEngine { ${excludeSlugsClause} ${languageClause} ${symbolKindClause} + ${afterDateClause} + ${beforeDateClause} ${hardExcludeClause} ${visibilityClause} 
ORDER BY cc.embedding <=> $1::vector @@ -1253,6 +1292,16 @@ export class PostgresEngine implements BrainEngine { return result; } + async getPageTimestamps(slugs: string[]): Promise> { + if (slugs.length === 0) return new Map(); + const sql = this.sql; + const rows = await sql` + SELECT slug, COALESCE(updated_at, created_at) as ts + FROM pages WHERE slug = ANY(${slugs}::text[]) + `; + return new Map(rows.map(r => [r.slug as string, new Date(r.ts as string)])); + } + async findOrphanPages(): Promise> { const sql = this.sql; const rows = await sql` diff --git a/src/core/search/hybrid.ts b/src/core/search/hybrid.ts index f58f95415..174fc68ae 100644 --- a/src/core/search/hybrid.ts +++ b/src/core/search/hybrid.ts @@ -16,6 +16,7 @@ import { embed } from '../embedding.ts'; import { dedupResults } from './dedup.ts'; import { autoDetectDetail } from './intent.ts'; import { expandAnchors, hydrateChunks } from './two-pass.ts'; +import { applyRecencyBoost } from './recency.ts'; const RRF_K = 60; const COMPILED_TRUTH_BOOST = 2.0; @@ -231,6 +232,23 @@ export async function hybridSearch( } } + // v0.27.0: recency boost — applied after backlink boost, before dedup. + // Auto-enabled when intent is temporal/event (detail='high'), or when + // opts.recencyBoost is explicitly set. Strength 1 = moderate (30-day + // halflife), 2 = aggressive (7-day halflife). Connection to intent.ts: + // temporal/event queries → detail='high' → recencyStrength=1 here. + const recencyStrength = opts?.recencyBoost ?? (detail === 'high' ? 1 : 0); + if (recencyStrength > 0 && fused.length > 0) { + try { + const recencySlugs = Array.from(new Set(fused.map(r => r.slug))); + const timestamps = await engine.getPageTimestamps(recencySlugs); + applyRecencyBoost(fused, timestamps, recencyStrength as 1 | 2); + fused.sort((a, b) => b.score - a.score); + } catch { + // Recency boost failure is non-fatal: keep existing ranking. 
+ } + } + // Dedup const deduped = dedupResults(fused, dedupOpts); diff --git a/src/core/search/recency.ts b/src/core/search/recency.ts new file mode 100644 index 000000000..9a0062f2f --- /dev/null +++ b/src/core/search/recency.ts @@ -0,0 +1,68 @@ +/** + * Recency Boost for Search Results (v0.27.0) + * + * Applies a time-decay boost to search results so newer pages rank higher. + * Uses a hyperbolic decay curve — recent pages get a meaningful boost, + * but old pages aren't completely buried. + * + * Boost formula: score *= (1 + coefficient / (1 + days_old / halflife)) + * + * At halflife days old, the boost is halved. + * strength=1: halflife=30 days, coefficient=1.0 (moderate — temporal queries) + * strength=2: halflife=7 days, coefficient=1.5 (aggressive — "what's new" queries) + * + * Brand-new page at strength=1: factor = 1 + 1.0 / (1 + 0/30) = 2.0x + * 30-day-old page at strength=1: factor = 1 + 1.0 / (1 + 1) = 1.5x + * 365-day-old page at strength=1: factor = 1 + 1.0 / (1 + 12.17) = ~1.076x + * + * Brand-new page at strength=2: factor = 1 + 1.5 / (1 + 0/7) = 2.5x + * 7-day-old page at strength=2: factor = 1 + 1.5 / (1 + 1) = 1.75x + * 365-day-old page at strength=2: factor = 1 + 1.5 / (1 + 52.14) = ~1.028x + * + * Same contract as applyBacklinkBoost: mutates results in place, caller re-sorts. + */ + +import type { SearchResult } from '../types.ts'; + +const DEBUG = process.env.GBRAIN_SEARCH_DEBUG === '1'; + +interface RecencyConfig { + halflifeDays: number; + coefficient: number; +} + +const STRENGTH_CONFIG: Record<1 | 2, RecencyConfig> = { + 1: { halflifeDays: 30, coefficient: 1.0 }, + 2: { halflifeDays: 7, coefficient: 1.5 }, +}; + +/** + * Apply recency boost to a result list in place. Mutates each result's score + * by (1 + coefficient / (1 + days_old / halflife)). Pure data transform; no DB call. + * Caller fetches timestamps via engine.getPageTimestamps. 
+ */ +export function applyRecencyBoost( + results: SearchResult[], + pageTimestamps: Map, + strength: 1 | 2, +): void { + const config = STRENGTH_CONFIG[strength]; + const now = Date.now(); + + for (const r of results) { + const ts = pageTimestamps.get(r.slug); + if (!ts) continue; // no timestamp → no boost (factor = 1.0) + + const msOld = now - ts.getTime(); + const daysOld = Math.max(0, msOld / (1000 * 60 * 60 * 24)); + const factor = 1.0 + config.coefficient / (1.0 + daysOld / config.halflifeDays); + + if (DEBUG) { + console.error( + `[search-debug] recency: ${r.slug} days_old=${daysOld.toFixed(1)} factor=${factor.toFixed(4)} strength=${strength} score=${r.score.toFixed(4)}→${(r.score * factor).toFixed(4)}`, + ); + } + + r.score *= factor; + } +} diff --git a/src/core/types.ts b/src/core/types.ts index dc3d12c3f..fa666b406 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -304,6 +304,12 @@ export interface SearchOpts { * undefined to search all sources. */ sourceId?: string; + /** v0.27.0: filter results to pages updated/created after this date. ISO-8601 string. */ + afterDate?: string; + /** v0.27.0: filter results to pages updated/created before this date. ISO-8601 string. */ + beforeDate?: string; + /** v0.27.0: recency boost strength. 0 = off, 1 = moderate, 2 = aggressive. Default: auto-detected from intent. 
*/ + recencyBoost?: 0 | 1 | 2; } /** diff --git a/test/recency-boost.test.ts b/test/recency-boost.test.ts new file mode 100644 index 000000000..518e452c9 --- /dev/null +++ b/test/recency-boost.test.ts @@ -0,0 +1,138 @@ +import { describe, it, expect } from 'bun:test'; +import { applyRecencyBoost } from '../src/core/search/recency.ts'; +import type { SearchResult } from '../src/core/types.ts'; + +function makeResult(slug: string, score: number): SearchResult { + return { + slug, + page_id: 1, + title: slug, + type: 'concept' as any, + chunk_text: 'test', + chunk_source: 'compiled_truth', + chunk_id: 1, + chunk_index: 0, + score, + stale: false, + }; +} + +function daysAgo(days: number): Date { + return new Date(Date.now() - days * 24 * 60 * 60 * 1000); +} + +describe('applyRecencyBoost', () => { + it('brand-new page gets max boost at strength=1 (~2.0x)', () => { + const results = [makeResult('new-page', 1.0)]; + const timestamps = new Map([['new-page', new Date()]]); + applyRecencyBoost(results, timestamps, 1); + // factor = 1 + 1.0 / (1 + 0/30) = 2.0 + expect(results[0].score).toBeCloseTo(2.0, 1); + }); + + it('brand-new page gets max boost at strength=2 (~2.5x)', () => { + const results = [makeResult('new-page', 1.0)]; + const timestamps = new Map([['new-page', new Date()]]); + applyRecencyBoost(results, timestamps, 2); + // factor = 1 + 1.5 / (1 + 0/7) = 2.5 + expect(results[0].score).toBeCloseTo(2.5, 1); + }); + + it('30-day-old page gets ~half boost at strength=1 (~1.5x)', () => { + const results = [makeResult('old-page', 1.0)]; + const timestamps = new Map([['old-page', daysAgo(30)]]); + applyRecencyBoost(results, timestamps, 1); + // factor = 1 + 1.0 / (1 + 30/30) = 1 + 1/2 = 1.5 + expect(results[0].score).toBeCloseTo(1.5, 1); + }); + + it('365-day-old page gets minimal boost at strength=1', () => { + const results = [makeResult('ancient', 1.0)]; + const timestamps = new Map([['ancient', daysAgo(365)]]); + applyRecencyBoost(results, timestamps, 1); + // 
factor = 1 + 1.0 / (1 + 365/30) ≈ 1.076 + expect(results[0].score).toBeGreaterThan(1.0); + expect(results[0].score).toBeLessThan(1.1); + }); + + it('strength=2 decays faster than strength=1', () => { + const r1 = [makeResult('page', 1.0)]; + const r2 = [makeResult('page', 1.0)]; + const timestamps = new Map([['page', daysAgo(14)]]); + applyRecencyBoost(r1, timestamps, 1); + applyRecencyBoost(r2, timestamps, 2); + // At 14 days: strength=1 factor = 1 + 1/(1+14/30) ≈ 1.68 + // At 14 days: strength=2 factor = 1 + 1.5/(1+14/7) = 1 + 1.5/3 = 1.5 + // strength=2 has already decayed more at 14 days + expect(r1[0].score).toBeGreaterThan(r2[0].score); + }); + + it('page with no timestamp gets no boost (score unchanged)', () => { + const results = [makeResult('no-ts', 0.75)]; + const timestamps = new Map(); // empty + applyRecencyBoost(results, timestamps, 1); + expect(results[0].score).toBe(0.75); + }); + + it('empty results array is a no-op', () => { + const results: SearchResult[] = []; + const timestamps = new Map(); + applyRecencyBoost(results, timestamps, 1); + expect(results).toHaveLength(0); + }); + + it('mutates results in place (same contract as backlink boost)', () => { + const result = makeResult('test', 1.0); + const results = [result]; + const timestamps = new Map([['test', new Date()]]); + applyRecencyBoost(results, timestamps, 1); + // Same object reference, mutated score + expect(results[0]).toBe(result); + expect(result.score).toBeGreaterThan(1.0); + }); + + it('multiple results get independent boosts', () => { + const results = [ + makeResult('new', 1.0), + makeResult('medium', 1.0), + makeResult('old', 1.0), + ]; + const timestamps = new Map([ + ['new', daysAgo(0)], + ['medium', daysAgo(30)], + ['old', daysAgo(365)], + ]); + applyRecencyBoost(results, timestamps, 1); + expect(results[0].score).toBeGreaterThan(results[1].score); + expect(results[1].score).toBeGreaterThan(results[2].score); + }); +}); + +// Intent detection tests (recency is auto-triggered 
by temporal intent) +import { classifyQueryIntent } from '../src/core/search/intent.ts'; + +describe('intent classification → recency triggering', () => { + it('"what\'s new with Ollama" → temporal (triggers recency)', () => { + expect(classifyQueryIntent("what's new with Ollama")).toBe('temporal'); + }); + + it('"recent updates on X" → temporal (triggers recency)', () => { + expect(classifyQueryIntent('recent updates on X')).toBe('temporal'); + }); + + it('"latest on YC Labs" → temporal (triggers recency)', () => { + expect(classifyQueryIntent('latest on YC Labs')).toBe('temporal'); + }); + + it('"who is Garry Tan" → entity (no recency)', () => { + expect(classifyQueryIntent('who is Garry Tan')).toBe('entity'); + }); + + it('"tell me about Ollama" → entity (no recency)', () => { + expect(classifyQueryIntent('tell me about Ollama')).toBe('entity'); + }); + + it('"Ollama" (bare name) → general (no recency)', () => { + expect(classifyQueryIntent('Ollama')).toBe('general'); + }); +}); From 421ec75e1068bad54c1ff00909b67628d566a444 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:06:35 -0700 Subject: [PATCH 02/15] v0.29.1 schema: pages.{effective_date, effective_date_source, import_filename, salience_touched_at} + expression index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration v38 adds 4 nullable columns to pages and an expression index on COALESCE(effective_date, updated_at) to support the new since/until date filters. All additive — no behavior change in the default search path; only consulted when callers opt into the new salience='on' / recency='on' axes or pass since/until. effective_date — content date (event_date / date / published / filename-date / fallback). Read by recency boost and date-filter paths only. Auto-link doesn't touch it (immune to updated_at churn). 
effective_date_source — sentinel for the doctor's effective_date_health check ('event_date' | 'date' | 'published' | 'filename' | 'fallback'). import_filename — basename without extension, captured at import. Used for filename-date precedence on daily/, meetings/. Older rows leave it NULL. salience_touched_at — bumped by recompute_emotional_weight when emotional_weight changes. Salience window uses GREATEST(updated_at, salience_touched_at) so newly-salient old pages enter the recent salience query. Index strategy: a partial index on effective_date alone wouldn't help the COALESCE expression in since/until filters (planner can't use it for the negative side). The expression index ((COALESCE(effective_date, updated_at))) is what actually accelerates the filter. Postgres uses CONCURRENTLY + v14-style pg_index.indisvalid pre-drop guard for prior failed CONCURRENTLY runs; PGLite uses plain CREATE INDEX. Mirror of v34's pattern. src/schema.sql + src/core/pglite-schema.ts updated for fresh installs; src/core/schema-embedded.ts regenerated via bun run build:schema. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/migrate.ts | 75 +++++++++++++++++++++++++++++++++++++ src/core/pglite-schema.ts | 8 ++++ src/core/schema-embedded.ts | 17 +++++++++ src/schema.sql | 17 +++++++++ 4 files changed, 117 insertions(+) diff --git a/src/core/migrate.ts b/src/core/migrate.ts index 9e4e97228..7f46ebeaf 100644 --- a/src/core/migrate.ts +++ b/src/core/migrate.ts @@ -1576,6 +1576,81 @@ export const MIGRATIONS: Migration[] = [ ADD COLUMN IF NOT EXISTS emotional_weight REAL NOT NULL DEFAULT 0.0; `, }, + { + version: 38, + name: 'pages_recency_columns', + sql: '', + // v0.29.1 — Salience-and-Recency, additive opt-in. 
+ // + // Four new pages columns (all nullable, additive only, no behavior change + // in the default search path; only consulted when a caller opts into + // `salience='on'` / `recency='on'` or the new `since`/`until` filter): + // + // effective_date — content date (event_date / date / published / + // filename-date / fallback). Read by the new + // recency boost and date-filter paths only. + // Auto-link doesn't touch it (immune to + // updated_at churn). + // effective_date_source — sentinel for the doctor's effective_date_health + // check ('event_date' | 'date' | 'published' | + // 'filename' | 'fallback'). The 'fallback' value + // is what surfaces "page that fell back to + // updated_at when frontmatter was unparseable". + // import_filename — basename without extension, captured at import. + // computeEffectiveDate uses it for filename-date + // precedence (daily/, meetings/ prefixes). Older + // rows leave it NULL; backfill falls through. + // salience_touched_at — bumped by recompute_emotional_weight when + // emotional_weight changes. Salience window + // uses GREATEST(updated_at, salience_touched_at) + // so newly-salient old pages enter the recent + // salience query. + // + // Plus an expression index used by since/until filters that read + // COALESCE(effective_date, updated_at). Partial-index claim from earlier + // plan iterations was wrong (codex pass-2 #15) — the planner won't use a + // partial index for the negative side of a COALESCE; expression index does. + // + // CONCURRENTLY + pre-drop guard (mirror of v34) on Postgres; plain CREATE + // INDEX on PGLite via the handler branching on engine.kind. + handler: async (engine) => { + // 1. ADD COLUMN x4. ALTER TABLE ADD COLUMN IF NOT EXISTS is idempotent. + // No defaults, all nullable, all metadata-only on PG 11+ and PGLite. 
+ await engine.runMigration(38, ` + ALTER TABLE pages ADD COLUMN IF NOT EXISTS effective_date TIMESTAMPTZ; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS effective_date_source TEXT; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS import_filename TEXT; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS salience_touched_at TIMESTAMPTZ; + `); + + // 2. Expression index for since/until date-range filters. + if (engine.kind === 'postgres') { + // Pre-drop any invalid index from a prior CONCURRENTLY failure. + await engine.runMigration(38, ` + DO $$ BEGIN + IF EXISTS ( + SELECT 1 FROM pg_index i + JOIN pg_class c ON c.oid = i.indexrelid + WHERE c.relname = 'pages_coalesce_date_idx' AND NOT i.indisvalid + ) THEN + EXECUTE 'DROP INDEX CONCURRENTLY IF EXISTS pages_coalesce_date_idx'; + END IF; + END $$; + `); + await engine.runMigration(38, ` + CREATE INDEX CONCURRENTLY IF NOT EXISTS pages_coalesce_date_idx + ON pages ((COALESCE(effective_date, updated_at))); + `); + } else { + await engine.runMigration(38, ` + CREATE INDEX IF NOT EXISTS pages_coalesce_date_idx + ON pages ((COALESCE(effective_date, updated_at))); + `); + } + }, + // CONCURRENTLY on Postgres requires no surrounding transaction. + transaction: false, + }, ]; export const LATEST_VERSION = MIGRATIONS.length > 0 diff --git a/src/core/pglite-schema.ts b/src/core/pglite-schema.ts index e79da18a1..8fe05f8e9 100644 --- a/src/core/pglite-schema.ts +++ b/src/core/pglite-schema.ts @@ -69,6 +69,11 @@ CREATE TABLE IF NOT EXISTS pages ( updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), -- v0.26.5: soft-delete + recovery window (mirrors src/schema.sql). deleted_at TIMESTAMPTZ, + -- v0.29.1: salience-and-recency, additive opt-in (mirrors src/schema.sql). 
+ effective_date TIMESTAMPTZ, + effective_date_source TEXT, + import_filename TEXT, + salience_touched_at TIMESTAMPTZ, CONSTRAINT pages_source_slug_key UNIQUE (source_id, slug) ); @@ -79,6 +84,9 @@ CREATE INDEX IF NOT EXISTS idx_pages_source_id ON pages(source_id); -- v0.26.5: partial index supports the autopilot purge sweep (mirrors src/schema.sql). CREATE INDEX IF NOT EXISTS pages_deleted_at_purge_idx ON pages (deleted_at) WHERE deleted_at IS NOT NULL; +-- v0.29.1: expression index for since/until date-range filters. +CREATE INDEX IF NOT EXISTS pages_coalesce_date_idx + ON pages ((COALESCE(effective_date, updated_at))); -- ============================================================ -- content_chunks: chunked content with embeddings diff --git a/src/core/schema-embedded.ts b/src/core/schema-embedded.ts index 20af92530..565431e4c 100644 --- a/src/core/schema-embedded.ts +++ b/src/core/schema-embedded.ts @@ -92,6 +92,17 @@ CREATE TABLE IF NOT EXISTS pages ( -- where deleted_at < now() - 72h. Search and \`get_page\` filter -- \`WHERE deleted_at IS NULL\` by default; \`include_deleted: true\` opts in. deleted_at TIMESTAMPTZ, + -- v0.29.1: salience-and-recency, additive opt-in. All NULL by default; + -- only consulted when a caller passes \`salience='on'\` / \`recency='on'\` or + -- the new \`since\`/\`until\` filter. effective_date_source is a sentinel for + -- the doctor's effective_date_health check (values: 'event_date' | 'date' + -- | 'published' | 'filename' | 'fallback'). salience_touched_at is bumped + -- by recompute_emotional_weight when emotional_weight changes so the + -- salience window picks up newly-salient old pages. + effective_date TIMESTAMPTZ, + effective_date_source TEXT, + import_filename TEXT, + salience_touched_at TIMESTAMPTZ, CONSTRAINT pages_source_slug_key UNIQUE (source_id, slug) ); @@ -109,6 +120,12 @@ CREATE INDEX IF NOT EXISTS idx_pages_source_id ON pages(source_id); -- stays low. 
Don't add a regular \`(deleted_at)\` index without measuring. CREATE INDEX IF NOT EXISTS pages_deleted_at_purge_idx ON pages (deleted_at) WHERE deleted_at IS NOT NULL; +-- v0.29.1: expression index used by since/until date-range filters that read +-- COALESCE(effective_date, updated_at). A partial index on effective_date +-- alone would NOT help — the planner can't use it for the negative side of +-- the COALESCE. Expression index is what actually accelerates the filter. +CREATE INDEX IF NOT EXISTS pages_coalesce_date_idx + ON pages ((COALESCE(effective_date, updated_at))); -- ============================================================ -- content_chunks: chunked content with embeddings diff --git a/src/schema.sql b/src/schema.sql index bffc853cc..9da0cf195 100644 --- a/src/schema.sql +++ b/src/schema.sql @@ -88,6 +88,17 @@ CREATE TABLE IF NOT EXISTS pages ( -- where deleted_at < now() - 72h. Search and `get_page` filter -- `WHERE deleted_at IS NULL` by default; `include_deleted: true` opts in. deleted_at TIMESTAMPTZ, + -- v0.29.1: salience-and-recency, additive opt-in. All NULL by default; + -- only consulted when a caller passes `salience='on'` / `recency='on'` or + -- the new `since`/`until` filter. effective_date_source is a sentinel for + -- the doctor's effective_date_health check (values: 'event_date' | 'date' + -- | 'published' | 'filename' | 'fallback'). salience_touched_at is bumped + -- by recompute_emotional_weight when emotional_weight changes so the + -- salience window picks up newly-salient old pages. + effective_date TIMESTAMPTZ, + effective_date_source TEXT, + import_filename TEXT, + salience_touched_at TIMESTAMPTZ, CONSTRAINT pages_source_slug_key UNIQUE (source_id, slug) ); @@ -105,6 +116,12 @@ CREATE INDEX IF NOT EXISTS idx_pages_source_id ON pages(source_id); -- stays low. Don't add a regular `(deleted_at)` index without measuring. 
CREATE INDEX IF NOT EXISTS pages_deleted_at_purge_idx ON pages (deleted_at) WHERE deleted_at IS NOT NULL; +-- v0.29.1: expression index used by since/until date-range filters that read +-- COALESCE(effective_date, updated_at). A partial index on effective_date +-- alone would NOT help — the planner can't use it for the negative side of +-- the COALESCE. Expression index is what actually accelerates the filter. +CREATE INDEX IF NOT EXISTS pages_coalesce_date_idx + ON pages ((COALESCE(effective_date, updated_at))); -- ============================================================ -- content_chunks: chunked content with embeddings From ea63fa15c4d0e7159b79210812f01c73ec86a2aa Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:11:28 -0700 Subject: [PATCH 03/15] v0.29.1: computeEffectiveDate helper + putPage integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure helper computing a page's effective_date from frontmatter precedence: 1. event_date (meeting/event pages) 2. date (dated essays) 3. published (writing/) 4. filename-date (leading YYYY-MM-DD in basename) 5. updated_at (fallback) 6. created_at (last resort) Per-prefix override: for daily/ and meetings/ slugs, filename-date jumps to position 1 — the filename is the user's primary signal there. Returns {date, source}. The source label powers the doctor's effective_date_health check to detect "fell back to updated_at" rows that look populated but are functionally a NULL. Range validation: parsed value must be in [1990-01-01, NOW + 1 year]. Out-of-range values drop to the next chain element. Wired into importFromContent + importFromFile. The put_page MCP op derives filename from slug-tail when no caller-supplied filename is available. putPage SQL on both engines extended to write the new columns. 
ON CONFLICT uses COALESCE(EXCLUDED.x, pages.x) so callers that don't know about the new columns (auto-link, code reindex) preserve existing values rather than blanking them. SELECT projection extended to return them; rowToPage threads them through. 21 unit tests covering: precedence chain default order, per-prefix override, parse failure fall-through, range validation [1990, NOW+1y], parseDateLoose shape variants. All pass; typecheck clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/effective-date.ts | 158 +++++++++++++++++++++++++++++++++ src/core/import-file.ts | 37 +++++++- src/core/pglite-engine.ts | 28 +++--- src/core/postgres-engine.ts | 22 +++-- src/core/types.ts | 47 ++++++++++ src/core/utils.ts | 27 ++++-- test/effective-date.test.ts | 170 ++++++++++++++++++++++++++++++++++++ 7 files changed, 464 insertions(+), 25 deletions(-) create mode 100644 src/core/effective-date.ts create mode 100644 test/effective-date.test.ts diff --git a/src/core/effective-date.ts b/src/core/effective-date.ts new file mode 100644 index 000000000..825413928 --- /dev/null +++ b/src/core/effective-date.ts @@ -0,0 +1,158 @@ +/** + * v0.29.1 — Compute a page's effective_date from frontmatter precedence. + * + * The "effective date" is the answer to "when was this page about?" It's + * NOT updated_at (which churns from auto-link) and NOT created_at (which + * is the row insert time). It's the user's stated content date. + * + * Precedence chain (default order): + * 1. frontmatter.event_date — meeting / event pages + * 2. frontmatter.date — dated essays + * 3. frontmatter.published — writing/ + * 4. filename-date — leading YYYY-MM-DD in basename + * 5. updated_at — fallback + * 6. 
created_at — last resort (only if updated_at NULL) + * + * Per-prefix override: for `daily/` and `meetings/` slug prefixes, the + * filename-date jumps to position 1 — the filename is the user's primary + * signal there ("daily/2024-03-15.md" the FILE date matters more than any + * frontmatter the user pasted). + * + * Returns BOTH the parsed Date and the source label so the doctor's + * `effective_date_health` check can detect "fell back to updated_at" rows + * that look populated but are functionally equivalent to a NULL. + * + * Range validation: parsed value must be in [1990-01-01, NOW + 1 year]. + * Out-of-range values are dropped (the chain falls through to the next + * element). NaN / unparseable strings drop the same way. + * + * Pure function. No DB. Tested in test/effective-date.test.ts. + */ + +import type { EffectiveDateSource } from './types.ts'; + +export interface EffectiveDateResult { + date: Date | null; + source: EffectiveDateSource | null; +} + +export interface ComputeEffectiveDateOpts { + slug: string; + frontmatter: Record; + /** Basename without extension, e.g. "2024-03-15-acme-call". May be null/empty. */ + filename?: string | null; + updatedAt: Date; + createdAt: Date; +} + +/** + * Slug prefixes where the filename date wins over frontmatter dates. The + * user's primary signal in these directories is the filename, not arbitrary + * frontmatter the importer might have copied. + * + * Hardcoded in v0.29.1 (commit 2). v0.29.1 commit 5 introduces the + * recency-decay map; we could move this list there if we wanted user-tunable + * filename-first prefixes, but the daily/ + meetings/ defaults are stable + * enough that hardcoding is correct. + */ +const FILENAME_FIRST_PREFIXES = ['daily/', 'meetings/']; + +const MIN_DATE_MS = Date.UTC(1990, 0, 1); +const FILENAME_DATE_RE = /^(\d{4}-\d{2}-\d{2})/; + +function maxDateMs(): number { + // NOW + 1 year, computed at call time so tests with a mocked Date.now() + // see a moving boundary. 
Pages dated > 1 year in the future are almost + // always corrupt (epoch math gone wrong, typoed century, bad parse). + return Date.now() + 365 * 24 * 60 * 60 * 1000; +} + +/** Parse a frontmatter value as a Date. Accepts Date instances, ISO strings, YYYY-MM-DD. Returns null on any failure. */ +export function parseDateLoose(value: unknown): Date | null { + if (value == null) return null; + if (value instanceof Date) { + return Number.isFinite(value.getTime()) ? value : null; + } + if (typeof value === 'string') { + const trimmed = value.trim(); + if (trimmed === '') return null; + const ms = Date.parse(trimmed); + if (!Number.isFinite(ms)) return null; + return new Date(ms); + } + if (typeof value === 'number') { + // Plausibility: numbers are usually ms since epoch but YAML can yield + // bare integers (year? month? day?) — accept only if the resulting Date + // falls inside the valid window. validateInRange catches the rest. + return Number.isFinite(value) ? new Date(value) : null; + } + return null; +} + +function validateInRange(d: Date | null): Date | null { + if (d === null) return null; + const ms = d.getTime(); + if (!Number.isFinite(ms)) return null; + if (ms < MIN_DATE_MS) return null; + if (ms > maxDateMs()) return null; + return d; +} + +function extractFilenameDate(filename: string | null | undefined): Date | null { + if (!filename) return null; + const m = filename.match(FILENAME_DATE_RE); + if (!m) return null; + return validateInRange(parseDateLoose(m[1])); +} + +function hasFilenameFirstPrefix(slug: string): boolean { + for (const p of FILENAME_FIRST_PREFIXES) { + if (slug.startsWith(p)) return true; + } + return false; +} + +/** + * Run the precedence chain. Returns the first valid (in-range) date and its + * source label. Falls all the way through to updated_at / created_at as + * 'fallback' when nothing in frontmatter or filename parses. 
+ */ +export function computeEffectiveDate(opts: ComputeEffectiveDateOpts): EffectiveDateResult { + const { slug, frontmatter, filename, updatedAt, createdAt } = opts; + const filenameFirst = hasFilenameFirstPrefix(slug); + + const fmEvent = validateInRange(parseDateLoose(frontmatter.event_date)); + const fmDate = validateInRange(parseDateLoose(frontmatter.date)); + const fmPublished = validateInRange(parseDateLoose(frontmatter.published)); + const filenameDate = extractFilenameDate(filename); + + // Build the ordered candidate list. For filename-first prefixes + // (daily/, meetings/) the filename moves to the head of the chain. + const candidates: Array<{ date: Date | null; source: EffectiveDateSource }> = filenameFirst + ? [ + { date: filenameDate, source: 'filename' }, + { date: fmEvent, source: 'event_date' }, + { date: fmDate, source: 'date' }, + { date: fmPublished, source: 'published' }, + ] + : [ + { date: fmEvent, source: 'event_date' }, + { date: fmDate, source: 'date' }, + { date: fmPublished, source: 'published' }, + { date: filenameDate, source: 'filename' }, + ]; + + for (const c of candidates) { + if (c.date !== null) return { date: c.date, source: c.source }; + } + + // Fallback chain: updated_at, then created_at. Both are guaranteed + // non-null by the schema; the validation here is defensive against bad + // test fixtures. 
+ const upd = validateInRange(updatedAt); + if (upd !== null) return { date: upd, source: 'fallback' }; + const cre = validateInRange(createdAt); + if (cre !== null) return { date: cre, source: 'fallback' }; + + return { date: null, source: null }; +} diff --git a/src/core/import-file.ts b/src/core/import-file.ts index ded596afb..b4bd91d52 100644 --- a/src/core/import-file.ts +++ b/src/core/import-file.ts @@ -11,6 +11,7 @@ import { extractCodeRefs } from './link-extraction.ts'; import { embedBatch } from './embedding.ts'; import { slugifyPath, slugifyCodePath, isCodeFilePath } from './sync.ts'; import type { ChunkInput, PageType } from './types.ts'; +import { computeEffectiveDate } from './effective-date.ts'; /** * v0.20.0 Cathedral II Layer 8 D2 — markdown fence extraction helper. @@ -185,7 +186,15 @@ export async function importFromContent( engine: BrainEngine, slug: string, content: string, - opts: { noEmbed?: boolean } = {}, + opts: { + noEmbed?: boolean; + /** + * v0.29.1: basename without extension for filename-date precedence on + * `daily/`, `meetings/` slugs. importFromFile threads this from the + * disk path; the put_page MCP op derives it from the slug tail. + */ + filename?: string; + } = {}, ): Promise { // Reject oversized payloads before any parsing, chunking, or embedding happens. // Uses Buffer.byteLength to count UTF-8 bytes the same way disk size would, @@ -269,6 +278,23 @@ export async function importFromContent( await engine.transaction(async (tx) => { if (existing) await tx.createVersion(slug); + // v0.29.1 — compute effective_date from frontmatter precedence chain. + // Filename comes from importFromFile path (basename) or the slug tail + // (put_page MCP op fallback). updatedAt/createdAt use the existing + // page's timestamps when present; otherwise NOW() (the row about to + // be created). The result drives the recency boost and since/until + // filters when callers opt in; nothing in the default search path + // consults it. 
+ const filenameForChain = opts.filename ?? slug.split('/').pop() ?? slug; + const nowDate = new Date(); + const { date: effectiveDate, source: effectiveDateSource } = computeEffectiveDate({ + slug, + frontmatter: parsed.frontmatter, + filename: filenameForChain, + updatedAt: existing?.updated_at ?? nowDate, + createdAt: existing?.created_at ?? nowDate, + }); + await tx.putPage(slug, { type: parsed.type, title: parsed.title, @@ -276,6 +302,9 @@ export async function importFromContent( timeline: parsed.timeline || '', frontmatter: parsed.frontmatter, content_hash: hash, + effective_date: effectiveDate, + effective_date_source: effectiveDateSource, + import_filename: filenameForChain, }); // Tag reconciliation: remove stale, add current @@ -392,7 +421,11 @@ export async function importFromFile( // Pass the path-derived slug explicitly so that any future change to // parseMarkdown's precedence rules cannot re-introduce this bug. - return importFromContent(engine, expectedSlug, content, opts); + // v0.29.1: thread the basename (without extension) for filename-date + // precedence in computeEffectiveDate. e.g. `daily/2024-03-15.md` → + // filename `2024-03-15`. + const fileBasename = basename(relativePath, '.md'); + return importFromContent(engine, expectedSlug, content, { ...opts, filename: fileBasename }); } /** diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index d01365895..6d031e371 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -382,15 +382,20 @@ export class PGLiteEngine implements BrainEngine { const hash = page.content_hash || contentHash(page); const frontmatter = page.frontmatter || {}; - // v0.18.0 Step 2: source_id relies on the schema DEFAULT 'default' so - // existing callers still target the default source without threading - // a parameter. ON CONFLICT target becomes (source_id, slug) since the - // global UNIQUE(slug) was dropped in migration v17. 
Step 5+ will - // surface an explicit sourceId param on putPage for multi-source sync. + // v0.18.0 Step 2: source_id relies on the schema DEFAULT 'default'. + // ON CONFLICT target is (source_id, slug); global UNIQUE(slug) dropped in v17. const pageKind = page.page_kind || 'markdown'; + // v0.29.1 — additive opt-in columns. COALESCE(EXCLUDED.x, pages.x) + // preserves existing values when caller omits them (auto-link path, + // code reindex, etc.). Mirrors postgres-engine.ts. + const effectiveDate = page.effective_date instanceof Date + ? page.effective_date.toISOString() + : (page.effective_date ?? null); + const effectiveDateSource = page.effective_date_source ?? null; + const importFilename = page.import_filename ?? null; const { rows } = await this.db.query( - `INSERT INTO pages (slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at) - VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb, $8, now()) + `INSERT INTO pages (slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at, effective_date, effective_date_source, import_filename) + VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb, $8, now(), $9::timestamptz, $10, $11) ON CONFLICT (source_id, slug) DO UPDATE SET type = EXCLUDED.type, page_kind = EXCLUDED.page_kind, @@ -399,9 +404,12 @@ export class PGLiteEngine implements BrainEngine { timeline = EXCLUDED.timeline, frontmatter = EXCLUDED.frontmatter, content_hash = EXCLUDED.content_hash, - updated_at = now() - RETURNING id, slug, type, title, compiled_truth, timeline, frontmatter, content_hash, created_at, updated_at`, - [slug, page.type, pageKind, page.title, page.compiled_truth, page.timeline || '', JSON.stringify(frontmatter), hash] + updated_at = now(), + effective_date = COALESCE(EXCLUDED.effective_date, pages.effective_date), + effective_date_source = COALESCE(EXCLUDED.effective_date_source, pages.effective_date_source), + import_filename = COALESCE(EXCLUDED.import_filename, 
pages.import_filename) + RETURNING id, slug, type, title, compiled_truth, timeline, frontmatter, content_hash, created_at, updated_at, effective_date, effective_date_source, import_filename`, + [slug, page.type, pageKind, page.title, page.compiled_truth, page.timeline || '', JSON.stringify(frontmatter), hash, effectiveDate, effectiveDateSource, importFilename] ); return rowToPage(rows[0] as Record); } diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index b06301e70..e0661abc0 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -331,12 +331,19 @@ export class PostgresEngine implements BrainEngine { // v0.18.0 Step 2: source_id relies on schema DEFAULT 'default'. ON // CONFLICT target becomes (source_id, slug) since global UNIQUE(slug) - // was dropped in migration v17. See pglite-engine.ts for matching - // notes; multi-source sync (Step 5) will surface an explicit sourceId. + // was dropped in migration v17. const pageKind = page.page_kind || 'markdown'; + // v0.29.1 — effective_date / effective_date_source / import_filename are + // additive opt-in inputs from the importer (computeEffectiveDate). When + // omitted, the ON CONFLICT path preserves any existing value via + // COALESCE(EXCLUDED.x, pages.x) so a putPage that doesn't know about + // these columns (auto-link, code reindex, etc.) doesn't blank them out. + const effectiveDate = page.effective_date ?? null; + const effectiveDateSource = page.effective_date_source ?? null; + const importFilename = page.import_filename ?? 
null; const rows = await sql` - INSERT INTO pages (slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at) - VALUES (${slug}, ${page.type}, ${pageKind}, ${page.title}, ${page.compiled_truth}, ${page.timeline || ''}, ${sql.json(frontmatter as Parameters[0])}, ${hash}, now()) + INSERT INTO pages (slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at, effective_date, effective_date_source, import_filename) + VALUES (${slug}, ${page.type}, ${pageKind}, ${page.title}, ${page.compiled_truth}, ${page.timeline || ''}, ${sql.json(frontmatter as Parameters[0])}, ${hash}, now(), ${effectiveDate}, ${effectiveDateSource}, ${importFilename}) ON CONFLICT (source_id, slug) DO UPDATE SET type = EXCLUDED.type, page_kind = EXCLUDED.page_kind, @@ -345,8 +352,11 @@ export class PostgresEngine implements BrainEngine { timeline = EXCLUDED.timeline, frontmatter = EXCLUDED.frontmatter, content_hash = EXCLUDED.content_hash, - updated_at = now() - RETURNING id, slug, type, title, compiled_truth, timeline, frontmatter, content_hash, created_at, updated_at + updated_at = now(), + effective_date = COALESCE(EXCLUDED.effective_date, pages.effective_date), + effective_date_source = COALESCE(EXCLUDED.effective_date_source, pages.effective_date_source), + import_filename = COALESCE(EXCLUDED.import_filename, pages.import_filename) + RETURNING id, slug, type, title, compiled_truth, timeline, frontmatter, content_hash, created_at, updated_at, effective_date, effective_date_source, import_filename `; return rowToPage(rows[0]); } diff --git a/src/core/types.ts b/src/core/types.ts index fa666b406..801f78d60 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -26,8 +26,44 @@ export interface Page { * The autopilot purge phase hard-deletes rows where `deleted_at < now() - 72h`. 
*/ deleted_at?: Date | null; + /** + * v0.29.1: content date computed from frontmatter precedence chain + * (event_date / date / published / filename / fallback). Populated by + * `computeEffectiveDate`; immune to auto-link updated_at churn. Read by + * the recency boost and since/until filter; nothing in the default search + * path consults it. + */ + effective_date?: Date | null; + /** + * v0.29.1: which precedence step won (`event_date | date | published | + * filename | fallback`). Powers the doctor's `effective_date_health` check + * to detect pages that fell back to updated_at because frontmatter was + * unparseable. + */ + effective_date_source?: EffectiveDateSource | null; + /** + * v0.29.1: basename without extension captured at import (e.g. + * "2024-03-15-acme-call"). Used by computeEffectiveDate for filename-date + * precedence on `daily/` and `meetings/` prefixes. NULL for older rows + * imported pre-v0.29.1. + */ + import_filename?: string | null; + /** + * v0.29.1: bumped by `recompute_emotional_weight` when the page's + * emotional_weight changes. The salience query window uses + * `GREATEST(updated_at, salience_touched_at)` so newly-salient old pages + * surface in `get_recent_salience`. + */ + salience_touched_at?: Date | null; } +export type EffectiveDateSource = + | 'event_date' + | 'date' + | 'published' + | 'filename' + | 'fallback'; + export type PageKind = 'markdown' | 'code'; export interface PageInput { @@ -44,6 +80,17 @@ export interface PageInput { * `query --lang` filtering. */ page_kind?: PageKind; + /** + * v0.29.1: content date from frontmatter precedence (computed by importer + * via `computeEffectiveDate`). When omitted, putPage leaves the column + * unchanged on conflict (preserves any existing value); on insert the + * column is NULL. NULL is fine — recency paths COALESCE to updated_at. + */ + effective_date?: Date | null; + /** v0.29.1: paired with effective_date; NULL when effective_date is NULL. 
*/ + effective_date_source?: EffectiveDateSource | null; + /** v0.29.1: basename without extension captured at import. */ + import_filename?: string | null; } export interface PageFilters { diff --git a/src/core/utils.ts b/src/core/utils.ts index 19a1df4bb..05874ad6c 100644 --- a/src/core/utils.ts +++ b/src/core/utils.ts @@ -43,13 +43,21 @@ export function contentHash(page: PageInput): string { .digest('hex'); } +function readOptionalDate(raw: unknown): Date | null | undefined { + // Three-state read for columns that may or may not be in the SELECT + // projection: undefined (not selected), null (selected, NULL value), + // Date (selected, populated). Mirrors the v0.26.5 deleted_at pattern. + if (raw === undefined) return undefined; + if (raw === null) return null; + return new Date(raw as string); +} + export function rowToPage(row: Record): Page { - // v0.26.5: deleted_at is optional in the SELECT projection. When the column - // isn't selected (legacy callers), keep the field absent on the returned object. - const deletedAtRaw = row.deleted_at; - const deletedAt = deletedAtRaw == null - ? (deletedAtRaw === null ? null : undefined) - : new Date(deletedAtRaw as string); + const deletedAt = readOptionalDate(row.deleted_at); + const effectiveDate = readOptionalDate(row.effective_date); + const salienceTouchedAt = readOptionalDate(row.salience_touched_at); + const effectiveDateSource = row.effective_date_source as Page['effective_date_source'] | undefined; + const importFilename = row.import_filename as string | null | undefined; return { id: row.id as number, slug: row.slug as string, @@ -59,11 +67,16 @@ export function rowToPage(row: Record): Page { timeline: row.timeline as string, frontmatter: (typeof row.frontmatter === 'string' ? JSON.parse(row.frontmatter) : row.frontmatter) as Record, content_hash: row.content_hash as string | undefined, - // v0.29 (column added in migration v34). Old brains pre-migration return undefined. 
+ // v0.29 (column added in migration v37). Old brains pre-migration return undefined. emotional_weight: row.emotional_weight == null ? undefined : Number(row.emotional_weight), created_at: new Date(row.created_at as string), updated_at: new Date(row.updated_at as string), ...(deletedAt !== undefined && { deleted_at: deletedAt }), + // v0.29.1 (columns added in migration v38). Optional in SELECT projection. + ...(effectiveDate !== undefined && { effective_date: effectiveDate }), + ...(effectiveDateSource !== undefined && { effective_date_source: effectiveDateSource }), + ...(importFilename !== undefined && { import_filename: importFilename }), + ...(salienceTouchedAt !== undefined && { salience_touched_at: salienceTouchedAt }), }; } diff --git a/test/effective-date.test.ts b/test/effective-date.test.ts new file mode 100644 index 000000000..7c8c261f9 --- /dev/null +++ b/test/effective-date.test.ts @@ -0,0 +1,170 @@ +/** + * v0.29.1 — Tests for computeEffectiveDate (precedence chain + per-prefix + * override + range validation + parse-failure fall-through). + * + * The function is pure (no DB), so these are fast unit tests. + */ + +import { describe, test, expect } from 'bun:test'; +import { computeEffectiveDate, parseDateLoose } from '../src/core/effective-date.ts'; + +const baseUpdated = new Date('2026-05-04T12:00:00Z'); +const baseCreated = new Date('2026-05-01T12:00:00Z'); + +function run(opts: { + slug?: string; + fm?: Record; + filename?: string | null; + updatedAt?: Date; + createdAt?: Date; +}) { + return computeEffectiveDate({ + slug: opts.slug ?? 'wiki/example', + frontmatter: opts.fm ?? {}, + filename: opts.filename ?? null, + updatedAt: opts.updatedAt ?? baseUpdated, + createdAt: opts.createdAt ?? 
baseCreated, + }); +} + +describe('parseDateLoose', () => { + test('Date instance passthrough', () => { + const d = new Date('2024-03-15'); + expect(parseDateLoose(d)?.getTime()).toBe(d.getTime()); + }); + test('ISO string parses', () => { + const d = parseDateLoose('2024-03-15T00:00:00Z'); + expect(d?.toISOString()).toBe('2024-03-15T00:00:00.000Z'); + }); + test('YYYY-MM-DD string parses', () => { + const d = parseDateLoose('2024-03-15'); + expect(d?.toISOString().startsWith('2024-03-15')).toBe(true); + }); + test('null/undefined → null', () => { + expect(parseDateLoose(null)).toBeNull(); + expect(parseDateLoose(undefined)).toBeNull(); + }); + test('invalid Date → null', () => { + expect(parseDateLoose(new Date('not a date'))).toBeNull(); + }); + test('unparseable string → null', () => { + expect(parseDateLoose('tomorrow')).toBeNull(); + expect(parseDateLoose('garbage')).toBeNull(); + expect(parseDateLoose('')).toBeNull(); + }); +}); + +describe('computeEffectiveDate precedence chain (default order)', () => { + test('event_date wins when present', () => { + const r = run({ fm: { event_date: '2024-03-15', date: '2024-04-01', published: '2024-05-01' } }); + expect(r.source).toBe('event_date'); + expect(r.date?.toISOString().startsWith('2024-03-15')).toBe(true); + }); + + test('date wins when event_date absent', () => { + const r = run({ fm: { date: '2024-04-01', published: '2024-05-01' } }); + expect(r.source).toBe('date'); + expect(r.date?.toISOString().startsWith('2024-04-01')).toBe(true); + }); + + test('published wins when event_date + date absent', () => { + const r = run({ fm: { published: '2024-05-01' } }); + expect(r.source).toBe('published'); + expect(r.date?.toISOString().startsWith('2024-05-01')).toBe(true); + }); + + test('filename wins when no frontmatter dates', () => { + const r = run({ filename: '2024-06-15-some-meeting' }); + expect(r.source).toBe('filename'); + expect(r.date?.toISOString().startsWith('2024-06-15')).toBe(true); + }); + + 
test('fallback to updated_at when chain exhausted', () => { + const r = run({}); + expect(r.source).toBe('fallback'); + expect(r.date?.toISOString()).toBe(baseUpdated.toISOString()); + }); +}); + +describe('computeEffectiveDate per-prefix override (daily/, meetings/)', () => { + test('daily/ filename wins over event_date', () => { + const r = run({ + slug: 'daily/2024-03-15', + fm: { event_date: '2024-04-01' }, + filename: '2024-03-15', + }); + expect(r.source).toBe('filename'); + expect(r.date?.toISOString().startsWith('2024-03-15')).toBe(true); + }); + + test('meetings/ filename wins over date', () => { + const r = run({ + slug: 'meetings/2024-06-15-acme-call', + fm: { date: '2024-07-01' }, + filename: '2024-06-15-acme-call', + }); + expect(r.source).toBe('filename'); + expect(r.date?.toISOString().startsWith('2024-06-15')).toBe(true); + }); + + test('daily/ falls through to event_date when filename has no date', () => { + const r = run({ + slug: 'daily/notes', + fm: { event_date: '2024-04-01' }, + filename: 'notes-some-text', + }); + expect(r.source).toBe('event_date'); + expect(r.date?.toISOString().startsWith('2024-04-01')).toBe(true); + }); + + test('non-prefixed slug uses default precedence (event_date over filename)', () => { + const r = run({ + slug: 'wiki/people/widget-ceo', + fm: { event_date: '2024-04-01' }, + filename: '2024-06-15-widget-ceo', + }); + expect(r.source).toBe('event_date'); + expect(r.date?.toISOString().startsWith('2024-04-01')).toBe(true); + }); +}); + +describe('computeEffectiveDate parse failure fall-through', () => { + test('event_date "tomorrow" falls through to date', () => { + const r = run({ fm: { event_date: 'tomorrow', date: '2024-04-01' } }); + expect(r.source).toBe('date'); + expect(r.date?.toISOString().startsWith('2024-04-01')).toBe(true); + }); + + test('all frontmatter dates unparseable → filename wins', () => { + const r = run({ + fm: { event_date: 'garbage', date: 'tomorrow', published: 'last week' }, + filename: 
'2024-06-15-something', + }); + expect(r.source).toBe('filename'); + expect(r.date?.toISOString().startsWith('2024-06-15')).toBe(true); + }); + + test('filename without date prefix → fallback', () => { + const r = run({ filename: 'no-date-here' }); + expect(r.source).toBe('fallback'); + expect(r.date?.toISOString()).toBe(baseUpdated.toISOString()); + }); +}); + +describe('computeEffectiveDate range validation [1990, NOW + 1y]', () => { + test('pre-1990 frontmatter date drops to next chain element', () => { + const r = run({ fm: { event_date: '1985-01-01', date: '2024-04-01' } }); + expect(r.source).toBe('date'); + }); + + test('far-future frontmatter date drops to next chain element', () => { + // NOW is 2026-05-04 in test fixtures; 2030 is > NOW + 1y + const r = run({ fm: { event_date: '2030-01-01', date: '2024-04-01' } }); + expect(r.source).toBe('date'); + }); + + test('out-of-range filename date drops to fallback', () => { + const r = run({ filename: '1850-01-01-ancient' }); + expect(r.source).toBe('fallback'); + }); +}); From c1734b9319115ce6979ec28988c1a3944e08240f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:15:50 -0700 Subject: [PATCH 04/15] v0.29.1: backfill orchestrator + library function for existing pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/core/backfill-effective-date.ts is the shared library function. Walks pages in keyset-paginated batches (id > last_id ORDER BY id LIMIT 1000), runs computeEffectiveDate per row, UPDATEs effective_date + effective_date_source. Resumable via the `backfill.effective_date.last_id` checkpoint key in the config table — a killed process can re-run and pick up without re-doing rows. Idempotent: a full re-walk produces the same writes. Postgres-only: SET LOCAL statement_timeout = '600s' per batch. Doesn't refuse the migration on low session settings (codex pass-2 #16). 
src/commands/migrations/v0_29_1.ts is the orchestrator (4 phases mirroring v0_12_2). Phase A schema (gbrain init --migrate-only), Phase B backfill (via the library function), Phase C verify (count NULL effective_date), Phase D record (handled by runner). The library function is reusable from the gbrain reindex-frontmatter CLI command in the next commit. import_filename stays NULL for backfilled rows — pre-v0.29.1 imports didn't capture it. computeEffectiveDate uses the slug-tail when filename is NULL; daily/2024-03-15 backfilled gets effective_date from the slug. Registered in src/commands/migrations/index.ts. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/commands/migrations/index.ts | 2 + src/commands/migrations/v0_29_1.ts | 160 +++++++++++++++++ src/core/backfill-effective-date.ts | 261 ++++++++++++++++++++++++++++ 3 files changed, 423 insertions(+) create mode 100644 src/commands/migrations/v0_29_1.ts create mode 100644 src/core/backfill-effective-date.ts diff --git a/src/commands/migrations/index.ts b/src/commands/migrations/index.ts index 22403b1cb..355445fe4 100644 --- a/src/commands/migrations/index.ts +++ b/src/commands/migrations/index.ts @@ -23,6 +23,7 @@ import { v0_18_1 } from './v0_18_1.ts'; import { v0_21_0 } from './v0_21_0.ts'; import { v0_22_4 } from './v0_22_4.ts'; import { v0_28_0 } from './v0_28_0.ts'; +import { v0_29_1 } from './v0_29_1.ts'; export const migrations: Migration[] = [ v0_11_0, @@ -37,6 +38,7 @@ export const migrations: Migration[] = [ v0_21_0, v0_22_4, v0_28_0, + v0_29_1, ]; /** Look up a migration by exact version string. */ diff --git a/src/commands/migrations/v0_29_1.ts b/src/commands/migrations/v0_29_1.ts new file mode 100644 index 000000000..38b677d6c --- /dev/null +++ b/src/commands/migrations/v0_29_1.ts @@ -0,0 +1,160 @@ +/** + * v0.29.1 migration orchestrator — backfill effective_date for existing + * pages. 
+ * + * Migration v38 added pages.effective_date / effective_date_source / + * import_filename / salience_touched_at as nullable columns. Fresh imports + * post-v0.29.1 populate effective_date via the importer's + * `computeEffectiveDate`. Pre-v0.29.1 rows have NULL until this orchestrator + * walks them. + * + * Phases (all idempotent, resumable): + * A. Schema — `gbrain init --migrate-only` ensures v38 ran. + * B. Backfill — keyset-paginated UPDATE via `backfillEffectiveDate`. + * Resumable via the `backfill.effective_date.last_id` + * checkpoint key in the config table. Statement timeout + * set per-batch (Postgres only). + * C. Verify — count remaining NULL effective_date rows; warn if > 0. + * D. Record — handled by the runner. + */ + +import { execSync } from 'child_process'; +import type { Migration, OrchestratorOpts, OrchestratorResult, OrchestratorPhaseResult } from './types.ts'; +import { childGlobalFlags } from '../../core/cli-options.ts'; + +// ── Phase A — Schema ──────────────────────────────────────── + +function phaseASchema(opts: OrchestratorOpts): OrchestratorPhaseResult { + if (opts.dryRun) return { name: 'schema', status: 'skipped', detail: 'dry-run' }; + try { + execSync('gbrain init --migrate-only' + childGlobalFlags(), { + stdio: 'inherit', + timeout: 600_000, // 10 min — duplicate-heavy installs can be slow + env: process.env, + }); + return { name: 'schema', status: 'complete' }; + } catch (e) { + return { name: 'schema', status: 'failed', detail: e instanceof Error ? 
e.message : String(e) }; + } +} + +// ── Phase B — Backfill effective_date ─────────────────────── + +async function phaseBBackfill(opts: OrchestratorOpts): Promise { + if (opts.dryRun) return { name: 'backfill_effective_date', status: 'skipped', detail: 'dry-run' }; + try { + const { createEngine } = await import('../../core/engine-factory.ts'); + const { loadConfig, toEngineConfig } = await import('../../core/config.ts'); + const { backfillEffectiveDate } = await import('../../core/backfill-effective-date.ts'); + const cfg = loadConfig(); + if (!cfg) throw new Error('No gbrain config; run `gbrain init` first.'); + const engine = await createEngine(toEngineConfig(cfg)); + + let totalExamined = 0; + let totalUpdated = 0; + + const result = await backfillEffectiveDate(engine, { + onBatch: ({ batch, lastId, rowsTouched, cumulative }) => { + totalExamined = cumulative; + totalUpdated += rowsTouched; + if (batch % 10 === 0) { + process.stderr.write(` [backfill] batch ${batch} | last_id=${lastId} | examined=${cumulative} | updated_so_far=${totalUpdated}\n`); + } + }, + }); + + return { + name: 'backfill_effective_date', + status: 'complete', + detail: `examined=${result.examined} updated=${result.updated} fallback=${result.fallback} dur=${result.durationSec.toFixed(1)}s`, + }; + } catch (e) { + return { name: 'backfill_effective_date', status: 'failed', detail: e instanceof Error ? 
e.message : String(e) }; + } +} + +// ── Phase C — Verify ──────────────────────────────────────── + +async function phaseCVerify(opts: OrchestratorOpts): Promise { + if (opts.dryRun) return { name: 'verify', status: 'skipped', detail: 'dry-run' }; + try { + const { createEngine } = await import('../../core/engine-factory.ts'); + const { loadConfig, toEngineConfig } = await import('../../core/config.ts'); + const cfg = loadConfig(); + if (!cfg) throw new Error('No gbrain config; run `gbrain init` first.'); + const engine = await createEngine(toEngineConfig(cfg)); + // Count rows where effective_date is still NULL but frontmatter HAS a + // parseable date — those are the rows the backfill should have touched + // but didn't. (Rows that fall through to 'fallback' have non-null + // effective_date already; this catches genuine misses.) + const rows = await engine.executeRaw<{ count: string }>( + `SELECT COUNT(*)::text AS count FROM pages WHERE effective_date IS NULL`, + ); + const remaining = Number(rows[0]?.count ?? 0); + if (remaining > 0) { + return { + name: 'verify', + status: 'failed', + detail: `${remaining} pages still have NULL effective_date (backfill incomplete)`, + }; + } + return { name: 'verify', status: 'complete', detail: '0 pages with NULL effective_date' }; + } catch (e) { + return { name: 'verify', status: 'failed', detail: e instanceof Error ? 
e.message : String(e) }; + } +} + +// ── Orchestrator ──────────────────────────────────────────── + +async function orchestrator(opts: OrchestratorOpts): Promise { + console.log(''); + console.log('=== v0.29.1 — backfill effective_date for existing pages ==='); + if (opts.dryRun) console.log(' (dry-run; no side effects)'); + console.log(''); + + const phases: OrchestratorPhaseResult[] = []; + + const a = phaseASchema(opts); + phases.push(a); + if (a.status === 'failed') return finalize(phases, 'failed'); + + const b = await phaseBBackfill(opts); + phases.push(b); + if (b.status === 'failed') return finalize(phases, 'partial'); + + const c = await phaseCVerify(opts); + phases.push(c); + + const status: 'complete' | 'partial' | 'failed' = + c.status === 'failed' ? 'partial' : 'complete'; + + return finalize(phases, status); +} + +function finalize(phases: OrchestratorPhaseResult[], status: 'complete' | 'partial' | 'failed'): OrchestratorResult { + return { version: '0.29.1', status, phases }; +} + +export const v0_29_1: Migration = { + version: '0.29.1', + featurePitch: { + headline: 'Recency + salience as two opt-in axes — agent in charge of when to use each', + description: + 'gbrain v0.29.1 adds two new optional ranking axes to the query op: salience ' + + '(emotional_weight + take_count, the "this matters" signal) and recency (per-prefix ' + + 'age decay, the "this is recent" signal). Truly orthogonal — use either, both, or ' + + "neither. The query op's tool description teaches your agent when each makes sense " + + '("current state → on; canonical truth → off") and the agent can override per query. ' + + 'A new pages.effective_date column is computed at import from frontmatter precedence ' + + '(event_date / date / published / filename) and is immune to auto-link updated_at ' + + 'churn. Existing callers (no new params) get UNCHANGED behavior. 
Run ' + + "`gbrain dream --phase recompute_emotional_weight` once after upgrading.", + }, + orchestrator, +}; + +export const __testing = { + phaseASchema, + phaseBBackfill, + phaseCVerify, +}; diff --git a/src/core/backfill-effective-date.ts b/src/core/backfill-effective-date.ts new file mode 100644 index 000000000..807d45142 --- /dev/null +++ b/src/core/backfill-effective-date.ts @@ -0,0 +1,261 @@ +/** + * v0.29.1 — Backfill effective_date / effective_date_source for existing + * pages. + * + * Migration v38 added the columns; they're NULL for rows imported before + * v0.29.1. This walks every page in keyset-paginated batches, runs the + * `computeEffectiveDate` precedence chain, and UPDATEs in place. + * + * Resumable: stores `last_processed_id` in the `config` table after each + * batch. A killed process can re-run and pick up where it left off without + * re-doing rows. Idempotent: even a full re-walk produces the same writes. + * + * Postgres only sets `SET LOCAL statement_timeout = '600s'` per batch (does + * NOT refuse the migration on low session settings — codex pass-2 #16). + * + * Pure library function — same code path used by the v0_29_1 orchestrator + * AND the `gbrain reindex-frontmatter` CLI command (added in commit 4). + * + * Note: the `import_filename` column stays NULL on backfilled rows. We + * don't have the original filename for pre-v0.29.1 imports (codex pass-1 + * finding #6). For `daily/`/`meetings/` slugs whose filename-derived date + * IS in the slug tail, computeEffectiveDate falls through to the slug-tail + * heuristic via `slug.split('/').pop()` in importFromContent's caller path + * — but the orchestrator passes the slug-tail explicitly here so backfilled + * rows behave the same as fresh imports for those prefixes. 
+ */ + +import type { BrainEngine } from './engine.ts'; +import { computeEffectiveDate } from './effective-date.ts'; +import type { EffectiveDateSource } from './types.ts'; + +const BATCH_SIZE = 1000; +const CHECKPOINT_KEY = 'backfill.effective_date.last_id'; + +export interface BackfillOpts { + /** Limit total rows touched (testing). Undefined = no cap. */ + maxRows?: number; + /** Restart from id=0 even if a checkpoint exists. */ + fresh?: boolean; + /** Don't write; report what would happen. */ + dryRun?: boolean; + /** Per-batch progress callback. */ + onBatch?: (info: { batch: number; lastId: number; rowsTouched: number; cumulative: number }) => void; + /** + * Optional slug-prefix filter (e.g. 'meetings/') so the CLI command can + * scope to a subset. Undefined = no filter. + */ + slugPrefix?: string; + /** + * When true, recompute even if existing effective_date matches what + * the chain would produce. Default false (no-op-on-equal saves writes). + */ + force?: boolean; +} + +export interface BackfillResult { + /** Total rows examined across all batches. */ + examined: number; + /** Rows where effective_date was actually written (changed or newly computed). */ + updated: number; + /** Rows that fell through the chain to 'fallback' (matches updated_at/created_at). */ + fallback: number; + /** Final last_processed_id (for resume / debugging). */ + lastId: number; + /** Total wall-clock seconds. 
*/ + durationSec: number; +} + +interface PageRow { + id: number; + slug: string; + frontmatter: unknown; + import_filename: string | null; + effective_date: string | null; + effective_date_source: EffectiveDateSource | null; + created_at: string; + updated_at: string; +} + +function parseFrontmatter(raw: unknown): Record { + if (raw == null) return {}; + if (typeof raw === 'string') { + try { return JSON.parse(raw) as Record; } + catch { return {}; } + } + if (typeof raw === 'object') return raw as Record; + return {}; +} + +async function getCheckpoint(engine: BrainEngine, fresh: boolean): Promise { + if (fresh) return 0; + try { + const rows = await engine.executeRaw<{ value: string }>( + `SELECT value FROM config WHERE key = $1 LIMIT 1`, + [CHECKPOINT_KEY], + ); + if (rows.length === 0) return 0; + const n = Number(rows[0].value); + return Number.isFinite(n) && n >= 0 ? n : 0; + } catch { + return 0; + } +} + +async function setCheckpoint(engine: BrainEngine, lastId: number): Promise { + try { + await engine.executeRaw( + `INSERT INTO config (key, value) VALUES ($1, $2) + ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value`, + [CHECKPOINT_KEY, String(lastId)], + ); + } catch { + // Best effort. Failure to checkpoint just means re-walk on next run; + // doesn't corrupt state. + } +} + +async function clearCheckpoint(engine: BrainEngine): Promise { + try { + await engine.executeRaw(`DELETE FROM config WHERE key = $1`, [CHECKPOINT_KEY]); + } catch { + // Same — best effort. + } +} + +export async function backfillEffectiveDate( + engine: BrainEngine, + opts: BackfillOpts = {}, +): Promise { + const start = Date.now(); + const slugPrefix = opts.slugPrefix?.replace(/[\\%_]/g, (c) => '\\' + c) ?? null; + + let lastId = await getCheckpoint(engine, opts.fresh ?? false); + let examined = 0; + let updated = 0; + let fallback = 0; + let batchNum = 0; + + // Per-engine statement_timeout boost. 
Postgres can wedge on a slow
+  // batch otherwise; PGLite ignores SET LOCAL outside transactions but
+  // doesn't have the timeout problem in the first place (single writer).
+  const isPostgres = engine.kind === 'postgres';
+
+  while (true) {
+    if (opts.maxRows && examined >= opts.maxRows) break;
+
+    const limit = opts.maxRows
+      ? Math.min(BATCH_SIZE, opts.maxRows - examined)
+      : BATCH_SIZE;
+
+    // Keyset pagination: WHERE id > last_id ORDER BY id LIMIT N. Single-direction
+    // walk; safe under concurrent inserts (new rows show up at the tail).
+    const slugFilter = slugPrefix
+      ? `AND slug LIKE $2 ESCAPE '\\'` // SQL must see a single-char escape: ESCAPE '\'
+      : '';
+    const params: unknown[] = [lastId];
+    if (slugPrefix) params.push(slugPrefix + '%');
+    params.push(limit);
+    const limitParam = `$${params.length}`;
+
+    const rows = await engine.executeRaw<PageRow>(
+      `SELECT id, slug, frontmatter, import_filename, effective_date, effective_date_source, created_at, updated_at
+         FROM pages
+        WHERE id > $1 ${slugFilter}
+        ORDER BY id
+        LIMIT ${limitParam}`,
+      params,
+    );
+
+    if (rows.length === 0) break;
+
+    examined += rows.length;
+    let touched = 0;
+
+    if (!opts.dryRun) {
+      // Compute effective_date for each row, then UPDATE in a batch wrapped
+      // in its own transaction (so SET LOCAL statement_timeout scopes to it).
+      // postgres.js's `transaction` would be cleaner but we're using executeRaw
+      // for engine portability; explicit BEGIN/COMMIT does the same on both.
+      if (isPostgres) {
+        await engine.executeRaw(`BEGIN`);
+        await engine.executeRaw(`SET LOCAL statement_timeout = '600s'`);
+      }
+
+      try {
+        for (const r of rows) {
+          const fm = parseFrontmatter(r.frontmatter);
+          const filename = r.import_filename
+            || (r.slug.includes('/') ? r.slug.split('/').pop()!
: r.slug); + const computed = computeEffectiveDate({ + slug: r.slug, + frontmatter: fm, + filename, + updatedAt: new Date(r.updated_at), + createdAt: new Date(r.created_at), + }); + + // No-op-on-equal: skip the UPDATE if existing matches (saves write + // amplification on re-runs). `force: true` bypasses. + const existingMs = r.effective_date ? new Date(r.effective_date).getTime() : null; + const computedMs = computed.date ? computed.date.getTime() : null; + const datesMatch = existingMs === computedMs; + const sourcesMatch = (r.effective_date_source ?? null) === (computed.source ?? null); + + if (!opts.force && datesMatch && sourcesMatch) continue; + + await engine.executeRaw( + `UPDATE pages SET effective_date = $1::timestamptz, effective_date_source = $2 WHERE id = $3`, + [computed.date ? computed.date.toISOString() : null, computed.source, r.id], + ); + touched++; + if (computed.source === 'fallback') fallback++; + } + + if (isPostgres) await engine.executeRaw(`COMMIT`); + } catch (e) { + if (isPostgres) { + try { await engine.executeRaw(`ROLLBACK`); } catch { /* ignore */ } + } + throw e; + } + } else { + // Dry run: still count what WOULD change. + for (const r of rows) { + const fm = parseFrontmatter(r.frontmatter); + const filename = r.import_filename + || (r.slug.includes('/') ? r.slug.split('/').pop()! : r.slug); + const computed = computeEffectiveDate({ + slug: r.slug, + frontmatter: fm, + filename, + updatedAt: new Date(r.updated_at), + createdAt: new Date(r.created_at), + }); + const existingMs = r.effective_date ? new Date(r.effective_date).getTime() : null; + const computedMs = computed.date ? computed.date.getTime() : null; + if (existingMs !== computedMs || (r.effective_date_source ?? null) !== (computed.source ?? 
null)) { + touched++; + } + if (computed.source === 'fallback') fallback++; + } + } + + updated += touched; + lastId = rows[rows.length - 1].id; + batchNum++; + if (!opts.dryRun) await setCheckpoint(engine, lastId); + opts.onBatch?.({ batch: batchNum, lastId, rowsTouched: touched, cumulative: examined }); + } + + // Walk done; clear the checkpoint so the next manual run starts fresh. + if (!opts.dryRun) await clearCheckpoint(engine); + + return { + examined, + updated, + fallback, + lastId, + durationSec: (Date.now() - start) / 1000, + }; +} From 228af0f26ba18a212f7f920d2c5ec2bb41d218e1 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:18:28 -0700 Subject: [PATCH 05/15] v0.29.1: gbrain reindex-frontmatter CLI command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recovery / explicit-rebuild path for pages.effective_date. Used when: - User edited frontmatter dates after import - Post-upgrade backfill orchestrator finished but the user wants to re-walk a subset (e.g. just meetings/) after fixing some frontmatter - Precedence rules change between releases Thin wrapper over backfillEffectiveDate from commit 3 — same code path the v0_29_1 orchestrator uses; one source of truth. Flags mirror reindex-code: --source Scope to one sources row (placeholder; library library doesn't filter by source today, tracked v0.30+) --slug-prefix P Scope to slugs starting with P (e.g. 'meetings/') --dry-run Print what WOULD change, no DB writes --yes Skip confirmation prompt (required for non-TTY non-JSON) --json Machine-readable result envelope --force Re-apply even when computed value matches existing Wired into src/cli.ts. CLI handles its own engine lifecycle (creates + disconnects). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/cli.ts | 12 +- src/commands/reindex-frontmatter.ts | 186 ++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 src/commands/reindex-frontmatter.ts diff --git a/src/cli.ts b/src/cli.ts index 7c46852de..aee033204 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -19,7 +19,7 @@ for (const op of operations) { } // CLI-only commands that bypass the operation layer -const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'agent', 'apply-migrations', 'skillpack-check', 'skillpack', 'resolvers', 'integrity', 'repair-jsonb', 'orphans', 'sources', 'mounts', 'dream', 'check-resolvable', 'routing-eval', 'skillify', 'smoke-test', 'storage', 'repos', 'code-def', 'code-refs', 'reindex-code', 'code-callers', 'code-callees', 'frontmatter', 'auth', 'friction', 'claw-test', 'book-mirror', 'takes', 'think', 'salience', 'anomalies', 'transcripts']); +const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'agent', 'apply-migrations', 'skillpack-check', 'skillpack', 'resolvers', 'integrity', 'repair-jsonb', 'orphans', 'sources', 'mounts', 'dream', 'check-resolvable', 'routing-eval', 'skillify', 'smoke-test', 'storage', 'repos', 'code-def', 'code-refs', 'reindex-code', 'reindex-frontmatter', 'code-callers', 'code-callees', 'frontmatter', 'auth', 'friction', 'claw-test', 'book-mirror', 'takes', 'think', 'salience', 'anomalies', 'transcripts']); async function main() { // Parse global flags (--quiet / --progress-json 
/ --progress-interval) @@ -610,6 +610,16 @@ async function handleCliOnly(command: string, args: string[]) { await runReindexCodeCli(engine, args); break; } + case 'reindex-frontmatter': { + // v0.29.1: recovery / explicit-rebuild path for pages.effective_date. + // Mirror of reindex-code shape. Wraps the shared library function in + // src/core/backfill-effective-date.ts (same code path the v0.29.1 + // migration orchestrator uses). The orchestrator runs once on + // upgrade; this command is for after-the-fact frontmatter edits. + const { reindexFrontmatterCli } = await import('./commands/reindex-frontmatter.ts'); + await reindexFrontmatterCli(args); + return; // reindexFrontmatterCli handles its own engine lifecycle + } case 'code-callers': { // v0.20.0 Cathedral II Layer 10 (C4): "who calls ?" const { runCodeCallers } = await import('./commands/code-callers.ts'); diff --git a/src/commands/reindex-frontmatter.ts b/src/commands/reindex-frontmatter.ts new file mode 100644 index 000000000..6331c13b6 --- /dev/null +++ b/src/commands/reindex-frontmatter.ts @@ -0,0 +1,186 @@ +/** + * v0.29.1 — `gbrain reindex-frontmatter`. + * + * Recovery / explicit-rebuild path for `pages.effective_date`. Useful when: + * - The user edited frontmatter dates after import and wants the effective_date + * column refreshed without a full `gbrain sync`. + * - The post-upgrade backfill orchestrator finished but the user wants to + * re-walk a subset (e.g. just `meetings/`) after fixing some frontmatter. + * - The precedence rules change between releases and the user wants to + * re-apply on existing rows. + * + * Thin wrapper over the shared library function in + * `src/core/backfill-effective-date.ts` (same code path the migration + * orchestrator uses; one source of truth for the backfill logic). + * + * Flags mirror `reindex-code`: + * --source Scope to one sources row. Omit = all pages. + * --slug-prefix P Scope to slugs starting with P (e.g. 'meetings/'). 
+ * --dry-run         Print what WOULD change, no DB writes.
+ * --yes             Skip the confirmation prompt (required for non-TTY non-JSON).
+ * --json            Machine-readable result envelope.
+ * --force           Re-apply even when computed value matches existing
+ *                   (bypasses no-op-on-equal guard).
+ */
+
+import type { BrainEngine } from '../core/engine.ts';
+import { backfillEffectiveDate } from '../core/backfill-effective-date.ts';
+import { createInterface } from 'readline';
+
+export interface ReindexFrontmatterOpts {
+  sourceId?: string;
+  slugPrefix?: string;
+  dryRun?: boolean;
+  yes?: boolean;
+  json?: boolean;
+  force?: boolean;
+}
+
+export interface ReindexFrontmatterResult {
+  status: 'ok' | 'dry_run' | 'cancelled';
+  examined: number;
+  updated: number;
+  fallback: number;
+  durationSec: number;
+  source_filter?: string;
+  slug_prefix?: string;
+}
+
+async function countAffected(
+  engine: BrainEngine,
+  slugPrefix: string | undefined,
+  sourceId: string | undefined,
+): Promise<number> {
+  const where: string[] = [];
+  const params: unknown[] = [];
+  if (slugPrefix) {
+    params.push(slugPrefix.replace(/[\\%_]/g, (c) => '\\' + c) + '%');
+    where.push(`slug LIKE $${params.length} ESCAPE '\\'`); // SQL must see a single-char escape: ESCAPE '\'
+  }
+  if (sourceId) {
+    params.push(sourceId);
+    where.push(`source_id = $${params.length}`);
+  }
+  const sql = `SELECT COUNT(*)::text AS n FROM pages${where.length ? ' WHERE ' + where.join(' AND ') : ''}`;
+  const rows = await engine.executeRaw<{ n: string }>(sql, params);
+  return Number(rows[0]?.n ??
0); +} + +async function confirm(prompt: string): Promise { + if (!process.stdin.isTTY) return false; // No TTY = require --yes + const rl = createInterface({ input: process.stdin, output: process.stderr }); + return new Promise(resolve => { + rl.question(prompt + ' [y/N] ', (ans: string) => { + rl.close(); + resolve(ans.trim().toLowerCase() === 'y'); + }); + }); +} + +export async function runReindexFrontmatter( + engine: BrainEngine, + opts: ReindexFrontmatterOpts, +): Promise { + const total = await countAffected(engine, opts.slugPrefix, opts.sourceId); + + if (opts.dryRun) { + // Library function with dryRun=true counts would-update without writing. + const r = await backfillEffectiveDate(engine, { + slugPrefix: opts.slugPrefix, + dryRun: true, + force: opts.force, + // Note: the library doesn't support sourceId filter today; documented + // as a v0.30+ enhancement. CLI surfaces the param so the future + // refinement is non-breaking. + maxRows: total > 0 ? total : undefined, + }); + return { + status: 'dry_run', + examined: r.examined, + updated: r.updated, + fallback: r.fallback, + durationSec: r.durationSec, + slug_prefix: opts.slugPrefix, + source_filter: opts.sourceId, + }; + } + + // Confirm in TTY non-yes flow. + if (!opts.yes && !opts.json && total > 100) { + const ok = await confirm(`Reindex effective_date on ${total} page(s)? Force=${opts.force ? 
'yes' : 'no'}.`); + if (!ok) { + return { + status: 'cancelled', + examined: 0, updated: 0, fallback: 0, durationSec: 0, + slug_prefix: opts.slugPrefix, + source_filter: opts.sourceId, + }; + } + } + + const r = await backfillEffectiveDate(engine, { + slugPrefix: opts.slugPrefix, + force: opts.force, + fresh: true, // CLI is explicit; ignore checkpoint from prior orchestrator runs + onBatch: ({ batch, lastId, rowsTouched, cumulative }) => { + if (!opts.json && batch % 5 === 0) { + process.stderr.write(` [reindex] batch ${batch} | last_id=${lastId} | examined=${cumulative} | updated=${rowsTouched}\n`); + } + }, + }); + + return { + status: 'ok', + examined: r.examined, + updated: r.updated, + fallback: r.fallback, + durationSec: r.durationSec, + slug_prefix: opts.slugPrefix, + source_filter: opts.sourceId, + }; +} + +/** CLI entrypoint. Argv shape matches reindex-code for consistency. */ +export async function reindexFrontmatterCli(args: string[]): Promise { + const opts: ReindexFrontmatterOpts = {}; + for (let i = 0; i < args.length; i++) { + const a = args[i]; + if (a === '--source') opts.sourceId = args[++i]; + else if (a === '--slug-prefix') opts.slugPrefix = args[++i]; + else if (a === '--dry-run') opts.dryRun = true; + else if (a === '--yes' || a === '-y') opts.yes = true; + else if (a === '--json') opts.json = true; + else if (a === '--force') opts.force = true; + else { + console.error(`Unknown arg: ${a}`); + process.exit(2); + } + } + + const { createEngine } = await import('../core/engine-factory.ts'); + const { loadConfig, toEngineConfig } = await import('../core/config.ts'); + const cfg = loadConfig(); + if (!cfg) { + console.error('No gbrain config; run `gbrain init` first.'); + process.exit(1); + } + const engine = await createEngine(toEngineConfig(cfg)); + + try { + const result = await runReindexFrontmatter(engine, opts); + if (opts.json) { + console.log(JSON.stringify(result, null, 2)); + } else { + const noun = result.status === 'dry_run' ? 
'would update' : 'updated'; + console.error( + `\nReindex ${result.status}: examined=${result.examined} ${noun}=${result.updated} ` + + `fallback=${result.fallback} dur=${result.durationSec.toFixed(1)}s`, + ); + } + if (result.status === 'cancelled') process.exit(1); + } finally { + if ('disconnect' in engine && typeof engine.disconnect === 'function') { + await engine.disconnect(); + } + } +} From 307fd304de247b7383435a53bd525ecbb317b509 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:21:25 -0700 Subject: [PATCH 06/15] v0.29.1: recency-decay map + buildRecencyComponentSql (pure, unused) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/core/search/recency-decay.ts mirrors source-boost.ts in shape but drives RECENCY ONLY (per D9 codex resolution). Salience is a separate orthogonal axis; this map does not feed it. DEFAULT_RECENCY_DECAY: 10 generic prefixes (no fork-specific names). - concepts/ evergreen (halflifeDays=0) - originals/ 180d × 0.5 (long-tail decay; new essays nudged) - writing/ 365d × 0.4 - daily/ 14d × 1.5 (aggressive — freshness IS the signal) - meetings/ 60d × 1.0 - chat/ 7d × 1.0 - media/x/ 7d × 1.5 - media/articles/ 90d × 0.5 - people/companies/ 365d × 0.3 - deals/ 180d × 0.5 DEFAULT_FALLBACK: 90d × 0.5 for unmatched slugs. Override priority: defaults < gbrain.yml recency: < env (GBRAIN_RECENCY_DECAY) < per-call SearchOpts.recency_decay. parseRecencyDecayEnv format: comma-separated prefix:halflifeDays:coefficient triples. Refuses LOUD on parse error (RecencyDecayParseError) — codex pass-2 #M3 finding. No silent fallback like source-boost's parser. parseRecencyDecayYaml takes already-parsed YAML; throws on bad shape. buildRecencyComponentSql in sql-ranking.ts emits a CASE expression with longest-prefix-first ordering, evergreen short-circuit (literal 0 when halflifeDays=0 or coefficient=0), and EXTRACT(EPOCH ...) for non-zero branches. 
Output: ((CASE WHEN p.slug LIKE 'daily/%' THEN 1.5 * 14.0 / (14.0 + EXTRACT(EPOCH FROM (NOW() - ))/86400.0) ... END)) Typed NowExpr enum prevents SQL injection (codex pass-1 #5). Tests pass { kind: 'fixed', isoUtc } for deterministic output; production NOW(). The 'fixed' branch escapes single quotes via escapeSqlLiteral. 25 unit tests covering: env parser shape, env error cases, yaml parser shape, merge precedence (defaults < yaml < env < caller), CASE longest- prefix-first ordering, evergreen short-circuit, NowExpr fixed/now, single-quote injection defense, empty decayMap fallback path, default map composition (no fork names, concepts/ evergreen, daily/ aggressive). Pure module. Zero consumers in this commit; commit 6 wires it into getRecentSalience, commit 10 wires it into the post-fusion stage. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/search/recency-decay.ts | 201 ++++++++++++++++++++++++++++ src/core/search/sql-ranking.ts | 82 ++++++++++++ test/recency-decay.test.ts | 221 +++++++++++++++++++++++++++++++ 3 files changed, 504 insertions(+) create mode 100644 src/core/search/recency-decay.ts create mode 100644 test/recency-decay.test.ts diff --git a/src/core/search/recency-decay.ts b/src/core/search/recency-decay.ts new file mode 100644 index 000000000..7e8c83af4 --- /dev/null +++ b/src/core/search/recency-decay.ts @@ -0,0 +1,201 @@ +/** + * v0.29.1 — Per-prefix recency decay map. + * + * Drives the recency boost ONLY (per D9 codex resolution). Salience is a + * separate orthogonal axis based on emotional_weight + take_count and + * does NOT consume this map. The two axes compose multiplicatively in + * runPostFusionStages when both opt in. + * + * Keyed by slug prefix. Longest-prefix-match wins (sorted at lookup time + * inside sql-ranking.ts). Defaults are GENERIC prefixes only (no fork- + * specific names like 'openclaw/chat/' — that's a privacy violation per + * CLAUDE.md and tracked in iteration-1 codex finding C-CX-3). 
+ * + * Override priority (later wins): + * 1. DEFAULT_RECENCY_DECAY (this file) + * 2. gbrain.yml `recency:` section + * 3. GBRAIN_RECENCY_DECAY env var (prefix:halflifeDays:coefficient,...) + * 4. Per-call SearchOpts.recency_decay (tests + library consumers; not + * exposed on MCP) + * + * Per-prefix interpretation: + * - halflifeDays = 0 → evergreen, no decay (recency component = 0) + * - halflifeDays > 0 → hyperbolic decay; coefficient × halflife / (halflife + days_old) + * - At days_old=0: recency component = coefficient (max boost) + * - At days_old=halflife: recency component = coefficient / 2 + * + * Pure module. No side effects. Tested in test/recency-decay.test.ts. + */ + +export interface RecencyDecayConfig { + /** Days at which the recency component is halved. 0 = no decay (evergreen). */ + halflifeDays: number; + /** Max recency boost contribution at days_old = 0. Must be >= 0. */ + coefficient: number; +} + +export type RecencyDecayMap = Record; + +export const DEFAULT_RECENCY_DECAY: RecencyDecayMap = { + // Evergreen (curated, opinion, knowledge artifacts) — no decay. + // concepts/ is the canonical evergreen tier; originals/ + writing/ get + // long-tail decay so freshly-published essays do see a small nudge. + 'concepts/': { halflifeDays: 0, coefficient: 0 }, + 'originals/': { halflifeDays: 180, coefficient: 0.5 }, + 'writing/': { halflifeDays: 365, coefficient: 0.4 }, + + // Time-bound personal records — strongest decay, biggest coefficient. + // The user is asking "what was on my plate this week" / "what did we + // discuss in our 1:1"; freshness IS the signal. + 'daily/': { halflifeDays: 14, coefficient: 1.5 }, + 'meetings/': { halflifeDays: 60, coefficient: 1.0 }, + + // Bulk feeds — generic prefixes only. Real fork names go in user + // gbrain.yml, never in shipped defaults. 
+ 'chat/': { halflifeDays: 7, coefficient: 1.0 }, + 'media/x/': { halflifeDays: 7, coefficient: 1.5 }, + 'media/articles/': { halflifeDays: 90, coefficient: 0.5 }, + + // Entities — slow decay (a deal from 2 years ago is still relevant + // to a current portfolio query; less so to "what's new lately"). + 'people/': { halflifeDays: 365, coefficient: 0.3 }, + 'companies/': { halflifeDays: 365, coefficient: 0.3 }, + 'deals/': { halflifeDays: 180, coefficient: 0.5 }, +}; + +/** Fallback applied to slugs that don't match any default or override prefix. */ +export const DEFAULT_FALLBACK: RecencyDecayConfig = { + halflifeDays: 90, + coefficient: 0.5, +}; + +/** Sentinel error thrown by parsers; CLI catches it and exits with a useful message. */ +export class RecencyDecayParseError extends Error { + constructor(message: string, public readonly source: 'env' | 'yaml' | 'caller') { + super(message); + this.name = 'RecencyDecayParseError'; + } +} + +/** + * Parse the GBRAIN_RECENCY_DECAY env var. + * Format: comma-separated `prefix:halflifeDays:coefficient` triples. + * Example: "daily/:7:2.0,concepts/:0:0,custom/:30:1.0" + * + * Refuses on parse error (codex M-CX-3 / iteration-2 review). The source-boost + * env parser silently skipped malformed entries; that pattern bit users for + * years. Recency parser fails LOUD so misconfigurations surface at startup + * instead of silently degrading rankings. + */ +export function parseRecencyDecayEnv(env: string | undefined): RecencyDecayMap { + if (!env) return {}; + const out: RecencyDecayMap = {}; + const triples = env.split(',').map(s => s.trim()).filter(Boolean); + for (const triple of triples) { + // Prefix can't contain `:` because the field separator is `:`. We split + // on the FIRST and SECOND `:` from the right so the prefix may safely + // contain `/` etc. but NOT colons. 
+ const lastIdx = triple.lastIndexOf(':'); + if (lastIdx <= 0) { + throw new RecencyDecayParseError( + `Invalid GBRAIN_RECENCY_DECAY entry "${triple}": expected prefix:halflife:coefficient`, + 'env', + ); + } + const beforeLast = triple.slice(0, lastIdx); + const middleIdx = beforeLast.lastIndexOf(':'); + if (middleIdx <= 0) { + throw new RecencyDecayParseError( + `Invalid GBRAIN_RECENCY_DECAY entry "${triple}": expected prefix:halflife:coefficient`, + 'env', + ); + } + const prefix = triple.slice(0, middleIdx).trim(); + const halflifeRaw = triple.slice(middleIdx + 1, lastIdx).trim(); + const coefficientRaw = triple.slice(lastIdx + 1).trim(); + const halflife = Number.parseFloat(halflifeRaw); + const coefficient = Number.parseFloat(coefficientRaw); + if (!prefix) { + throw new RecencyDecayParseError(`Empty prefix in GBRAIN_RECENCY_DECAY entry "${triple}"`, 'env'); + } + if (!Number.isFinite(halflife) || halflife < 0) { + throw new RecencyDecayParseError( + `Invalid halflifeDays "${halflifeRaw}" in GBRAIN_RECENCY_DECAY (must be number >= 0; 0 = evergreen)`, + 'env', + ); + } + if (!Number.isFinite(coefficient) || coefficient < 0) { + throw new RecencyDecayParseError( + `Invalid coefficient "${coefficientRaw}" in GBRAIN_RECENCY_DECAY (must be number >= 0)`, + 'env', + ); + } + out[prefix] = { halflifeDays: halflife, coefficient }; + } + return out; +} + +/** + * Parse a `recency:` section from a parsed gbrain.yml. The shape is: + * recency: + * daily/: { halflifeDays: 14, coefficient: 1.5 } + * concepts/: { halflifeDays: 0, coefficient: 0 } + * + * `parsed` is the already-parsed YAML object. This is a pure transform. + * Caller is responsible for reading + parsing the YAML file. 
+ */ +export function parseRecencyDecayYaml(parsed: unknown): RecencyDecayMap { + if (parsed == null) return {}; + if (typeof parsed !== 'object' || Array.isArray(parsed)) return {}; + const obj = parsed as Record; + const recency = obj.recency; + if (recency == null) return {}; + if (typeof recency !== 'object' || Array.isArray(recency)) { + throw new RecencyDecayParseError(`gbrain.yml recency: must be a map, got ${typeof recency}`, 'yaml'); + } + const out: RecencyDecayMap = {}; + for (const [prefix, raw] of Object.entries(recency as Record)) { + if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) { + throw new RecencyDecayParseError( + `gbrain.yml recency."${prefix}" must be an object with halflifeDays + coefficient`, + 'yaml', + ); + } + const cfg = raw as Record; + const halflife = Number(cfg.halflifeDays); + const coefficient = Number(cfg.coefficient); + if (!Number.isFinite(halflife) || halflife < 0) { + throw new RecencyDecayParseError( + `gbrain.yml recency."${prefix}".halflifeDays invalid (must be number >= 0)`, + 'yaml', + ); + } + if (!Number.isFinite(coefficient) || coefficient < 0) { + throw new RecencyDecayParseError( + `gbrain.yml recency."${prefix}".coefficient invalid (must be number >= 0)`, + 'yaml', + ); + } + out[prefix] = { halflifeDays: halflife, coefficient }; + } + return out; +} + +/** + * Merge defaults + yaml + env + caller-supplied overrides into the effective + * decay map. Later sources win. Empty entries are dropped. + */ +export function resolveRecencyDecayMap(opts: { + yaml?: unknown; + envValue?: string; + caller?: RecencyDecayMap; +} = {}): RecencyDecayMap { + const fromYaml = opts.yaml !== undefined ? parseRecencyDecayYaml(opts.yaml) : {}; + const fromEnv = parseRecencyDecayEnv(opts.envValue ?? process.env.GBRAIN_RECENCY_DECAY); + return { + ...DEFAULT_RECENCY_DECAY, + ...fromYaml, + ...fromEnv, + ...(opts.caller ?? 
{}), + }; +} diff --git a/src/core/search/sql-ranking.ts b/src/core/search/sql-ranking.ts index 0feca067b..7dfff0828 100644 --- a/src/core/search/sql-ranking.ts +++ b/src/core/search/sql-ranking.ts @@ -129,5 +129,87 @@ export function buildVisibilityClause(pageAlias: string, sourceAlias: string): s return `AND ${pageAlias}.deleted_at IS NULL AND NOT ${sourceAlias}.archived`; } +// ============================================================ +// v0.29.1 — Recency component SQL builder +// ============================================================ + +/** + * Typed expression for "what NOW() should be" in the SQL. Tests pass + * `{ kind: 'fixed', isoUtc }` for deterministic output regardless of wall + * clock. Production callers leave it default (`{ kind: 'now' }`). + * + * The builder constructs the SQL literal internally via escapeSqlLiteral + * for the 'fixed' branch — caller-supplied strings NEVER flow into raw SQL, + * preventing the injection vector codex pass-1 #5 flagged. + */ +export type NowExpr = { kind: 'now' } | { kind: 'fixed'; isoUtc: string }; + +function nowExprToSql(now: NowExpr): string { + if (now.kind === 'now') return 'NOW()'; + return `'${escapeSqlLiteral(now.isoUtc)}'::timestamptz`; +} + +/** + * Build the per-row recency component SQL fragment. + * + * For each prefix in the decay map, emit one CASE branch: + * - halflifeDays = 0 (or coefficient = 0) → literal 0 (evergreen short-circuit) + * - halflifeDays > 0 → coefficient * halflife / (halflife + days_old) + * + * Prefixes sorted longest-first so 'media/articles/' matches before 'media/' + * (mirror of buildSourceFactorCase's ordering). + * + * Output is a single SQL expression suitable for SELECT / ORDER BY. 
+ *
+ * @param slugColumn — qualified column reference (engine-supplied, trusted)
+ * @param dateExpr — qualified expression for the page's effective date
+ *   (typically `COALESCE(p.effective_date, p.updated_at)`)
+ * @param decayMap — per-prefix configurations (resolved from defaults +
+ *   yaml + env + caller)
+ * @param fallback — applied to slugs matching no prefix
+ * @param now — typed NOW() expression (default `{ kind: 'now' }`)
+ */
+export function buildRecencyComponentSql(opts: {
+  slugColumn: string;
+  dateExpr: string;
+  decayMap: import('./recency-decay.ts').RecencyDecayMap;
+  fallback: import('./recency-decay.ts').RecencyDecayConfig;
+  now?: NowExpr;
+}): string {
+  const { slugColumn, dateExpr, decayMap, fallback } = opts;
+  const now = opts.now ?? { kind: 'now' };
+  const nowSql = nowExprToSql(now);
+  const daysOldSql = `EXTRACT(EPOCH FROM (${nowSql} - ${dateExpr})) / 86400.0`;
+
+  const prefixes = Object.keys(decayMap).sort((a, b) => b.length - a.length);
+  const branches: string[] = [];
+
+  for (const prefix of prefixes) {
+    const cfg = decayMap[prefix];
+    const literal = buildLikePrefixLiteral(prefix);
+    if (cfg.halflifeDays === 0 || cfg.coefficient === 0) {
+      branches.push(`WHEN ${slugColumn} LIKE ${literal} THEN 0`);
+    } else {
+      const h = Number.isInteger(cfg.halflifeDays) ? `${cfg.halflifeDays}.0` : String(cfg.halflifeDays); // ".0" only for integers; "7.5.0" is invalid SQL
+      const c = cfg.coefficient;
+      branches.push(
+        `WHEN ${slugColumn} LIKE ${literal} THEN ${c} * ${h} / (${h} + ${daysOldSql})`,
+      );
+    }
+  }
+
+  let elseSql: string;
+  if (fallback.halflifeDays === 0 || fallback.coefficient === 0) {
+    elseSql = '0';
+  } else {
+    const h = Number.isInteger(fallback.halflifeDays) ? `${fallback.halflifeDays}.0` : String(fallback.halflifeDays); // same fractional-halflife guard as above
+    const c = fallback.coefficient;
+    elseSql = `${c} * ${h} / (${h} + ${daysOldSql})`;
+  }
+
+  if (branches.length === 0) return `(${elseSql})`;
+  return `(CASE ${branches.join(' ')} ELSE ${elseSql} END)`;
+}
+
 // Exported for unit tests
 export const __test__ = { escapeLikePattern, escapeSqlLiteral, buildLikePrefixLiteral };
diff --git a/test/recency-decay.test.ts b/test/recency-decay.test.ts
new file mode 100644 index 000000000..81f58af00 --- /dev/null +++ b/test/recency-decay.test.ts @@ -0,0 +1,221 @@ +/** + * v0.29.1 — recency-decay map + buildRecencyComponentSql tests. + * + * Pure functions, no DB. Fast unit tests. Cover the full env / yaml / merge + * resolution chain plus the SQL CASE shape (longest-prefix-match, evergreen + * short-circuit, injection-safe NowExpr). + */ + +import { describe, test, expect } from 'bun:test'; +import { + DEFAULT_RECENCY_DECAY, + DEFAULT_FALLBACK, + RecencyDecayParseError, + parseRecencyDecayEnv, + parseRecencyDecayYaml, + resolveRecencyDecayMap, +} from '../src/core/search/recency-decay.ts'; +import { buildRecencyComponentSql } from '../src/core/search/sql-ranking.ts'; + +describe('parseRecencyDecayEnv', () => { + test('empty / undefined → empty map', () => { + expect(parseRecencyDecayEnv(undefined)).toEqual({}); + expect(parseRecencyDecayEnv('')).toEqual({}); + }); + + test('single triple', () => { + expect(parseRecencyDecayEnv('daily/:7:1.5')).toEqual({ + 'daily/': { halflifeDays: 7, coefficient: 1.5 }, + }); + }); + + test('multiple triples comma-separated', () => { + const out = parseRecencyDecayEnv('daily/:7:1.5,concepts/:0:0,custom/:30:0.5'); + expect(out['daily/']).toEqual({ halflifeDays: 7, coefficient: 1.5 }); + expect(out['concepts/']).toEqual({ halflifeDays: 0, coefficient: 0 }); + expect(out['custom/']).toEqual({ halflifeDays: 30, coefficient: 0.5 }); + }); + + test('throws on missing field', () => { + expect(() => parseRecencyDecayEnv('daily/:7')).toThrow(RecencyDecayParseError); + expect(() => parseRecencyDecayEnv('daily/')).toThrow(RecencyDecayParseError); + }); + + test('throws on negative halflife', () => { + expect(() => parseRecencyDecayEnv('daily/:-1:1.5')).toThrow(RecencyDecayParseError); + }); + + test('throws on negative coefficient', () => { + expect(() => parseRecencyDecayEnv('daily/:7:-0.1')).toThrow(RecencyDecayParseError); + }); + + test('throws on non-numeric values', () => { + 
expect(() => parseRecencyDecayEnv('daily/:abc:1.5')).toThrow(RecencyDecayParseError); + }); + + test('throws on empty prefix', () => { + expect(() => parseRecencyDecayEnv(':7:1.5')).toThrow(RecencyDecayParseError); + }); +}); + +describe('parseRecencyDecayYaml', () => { + test('null / undefined / empty → empty map', () => { + expect(parseRecencyDecayYaml(null)).toEqual({}); + expect(parseRecencyDecayYaml(undefined)).toEqual({}); + expect(parseRecencyDecayYaml({})).toEqual({}); + }); + + test('valid recency block', () => { + const out = parseRecencyDecayYaml({ + recency: { + 'daily/': { halflifeDays: 14, coefficient: 1.5 }, + 'concepts/': { halflifeDays: 0, coefficient: 0 }, + }, + }); + expect(out['daily/']).toEqual({ halflifeDays: 14, coefficient: 1.5 }); + expect(out['concepts/']).toEqual({ halflifeDays: 0, coefficient: 0 }); + }); + + test('throws on bad halflifeDays', () => { + expect(() => + parseRecencyDecayYaml({ recency: { 'daily/': { halflifeDays: -1, coefficient: 1.0 } } }), + ).toThrow(RecencyDecayParseError); + }); + + test('throws on non-object entry', () => { + expect(() => + parseRecencyDecayYaml({ recency: { 'daily/': 'invalid' } }), + ).toThrow(RecencyDecayParseError); + }); +}); + +describe('resolveRecencyDecayMap merge precedence', () => { + test('defaults baseline', () => { + const m = resolveRecencyDecayMap({}); + expect(m['concepts/']).toEqual({ halflifeDays: 0, coefficient: 0 }); + expect(m['daily/']).toEqual({ halflifeDays: 14, coefficient: 1.5 }); + }); + + test('env overrides defaults', () => { + const m = resolveRecencyDecayMap({ envValue: 'daily/:30:0.5' }); + expect(m['daily/']).toEqual({ halflifeDays: 30, coefficient: 0.5 }); + }); + + test('yaml + env: env wins', () => { + const m = resolveRecencyDecayMap({ + yaml: { recency: { 'daily/': { halflifeDays: 7, coefficient: 2.0 } } }, + envValue: 'daily/:30:0.5', + }); + expect(m['daily/']).toEqual({ halflifeDays: 30, coefficient: 0.5 }); + }); + + test('caller wins over env', () => { + 
const m = resolveRecencyDecayMap({ + envValue: 'daily/:30:0.5', + caller: { 'daily/': { halflifeDays: 1, coefficient: 5.0 } }, + }); + expect(m['daily/']).toEqual({ halflifeDays: 1, coefficient: 5.0 }); + }); +}); + +describe('buildRecencyComponentSql', () => { + const mini = { + 'concepts/': { halflifeDays: 0, coefficient: 0 }, + 'daily/': { halflifeDays: 14, coefficient: 1.5 }, + 'media/': { halflifeDays: 90, coefficient: 0.5 }, + }; + + test('emits CASE expression with longest-prefix-first ordering', () => { + const longerFirst = { + 'media/articles/': { halflifeDays: 60, coefficient: 0.5 }, + 'media/': { halflifeDays: 90, coefficient: 0.4 }, + }; + const sql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: longerFirst, + fallback: DEFAULT_FALLBACK, + }); + const idxLong = sql.indexOf("'media/articles/%'"); + const idxShort = sql.indexOf("'media/%'"); + expect(idxLong).toBeGreaterThan(0); + expect(idxShort).toBeGreaterThan(0); + expect(idxLong).toBeLessThan(idxShort); + }); + + test('evergreen short-circuit emits literal 0', () => { + const sql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: mini, + fallback: DEFAULT_FALLBACK, + }); + expect(sql).toContain("WHEN p.slug LIKE 'concepts/%' THEN 0"); + }); + + test('non-zero branches include EXTRACT(EPOCH ...)', () => { + const sql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: mini, + fallback: DEFAULT_FALLBACK, + }); + expect(sql).toContain('EXTRACT(EPOCH FROM (NOW() - p.updated_at)) / 86400.0'); + expect(sql).toContain('1.5 * 14.0 / (14.0 + EXTRACT(EPOCH'); + }); + + test('NowExpr.fixed is escaped (single-quote doubling) and timestamptz-cast', () => { + const sql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: { 'daily/': { halflifeDays: 7, coefficient: 1.0 } }, + fallback: DEFAULT_FALLBACK, + now: { kind: 'fixed', isoUtc: 
"2026-05-04T00:00:00Z" }, + }); + expect(sql).toContain("'2026-05-04T00:00:00Z'::timestamptz"); + expect(sql).not.toContain('NOW()'); + }); + + test('NowExpr.fixed with embedded single quote is doubled (injection defense)', () => { + const sql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: { 'daily/': { halflifeDays: 7, coefficient: 1.0 } }, + fallback: DEFAULT_FALLBACK, + now: { kind: 'fixed', isoUtc: "2026'; DROP TABLE pages;--" }, + }); + // The malicious quote must be doubled to ''. + expect(sql).toContain("''"); + expect(sql).not.toContain("DROP TABLE'"); + }); + + test('empty decayMap → only fallback ELSE branch', () => { + const sql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: {}, + fallback: { halflifeDays: 30, coefficient: 1.0 }, + }); + expect(sql).not.toContain('CASE'); + expect(sql).toContain('1 * 30.0 / (30.0 +'); + }); +}); + +describe('DEFAULT_RECENCY_DECAY composition', () => { + test('does not contain fork-specific names (no openclaw/, no wintermute/)', () => { + const keys = Object.keys(DEFAULT_RECENCY_DECAY); + for (const k of keys) { + expect(k.includes('openclaw')).toBe(false); + expect(k.includes('wintermute')).toBe(false); + } + }); + + test('concepts/ is evergreen (halflifeDays = 0)', () => { + expect(DEFAULT_RECENCY_DECAY['concepts/']?.halflifeDays).toBe(0); + }); + + test('daily/ has aggressive decay', () => { + expect(DEFAULT_RECENCY_DECAY['daily/']?.halflifeDays).toBeLessThan(30); + expect(DEFAULT_RECENCY_DECAY['daily/']?.coefficient).toBeGreaterThan(1); + }); +}); From 9e4072ecba81eb81f80e273c69ae39155e94532f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:23:01 -0700 Subject: [PATCH 07/15] v0.29.1: refactor getRecentSalience to consume buildRecencyComponentSql MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both engines (Postgres + PGLite) now build the salience formula's third term 
via buildRecencyComponentSql instead of inlining 1.0 / (1 + days_old). Parameters: empty decayMap + fallback { halflifeDays: 1, coefficient: 1.0 }. Math expands to 1 * 1.0 / (1.0 + days_old) = 1 / (1 + days_old) — same numeric output as v0.29.0. This is a no-behavior-change refactor preparing for commit 7's recency_bias param. recency_bias='flat' (default) reproduces v0.29.0 exactly; 'on' swaps in DEFAULT_RECENCY_DECAY for per-prefix decay. Single source of truth for the recency math: same builder feeds the salience query AND (in commit 10) the post-fusion applyRecencyBoost stage. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/pglite-engine.ts | 13 +++++++++++-- src/core/postgres-engine.ts | 15 +++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 6d031e371..0448d5873 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -34,7 +34,7 @@ import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult, ta import { GBrainError, PAGE_SORT_SQL } from './types.ts'; import { computeAnomaliesFromBuckets } from './cycle/anomaly.ts'; import { resolveBoostMap, resolveHardExcludes } from './search/source-boost.ts'; -import { buildSourceFactorCase, buildHardExcludeClause, buildVisibilityClause } from './search/sql-ranking.ts'; +import { buildSourceFactorCase, buildHardExcludeClause, buildVisibilityClause, buildRecencyComponentSql } from './search/sql-ranking.ts'; type PGLiteDB = PGlite; @@ -2241,13 +2241,22 @@ export class PGLiteEngine implements BrainEngine { params.push(limit); const limitParam = `$${params.length}`; + // v0.29.1: third score term via buildRecencyComponentSql with a "flat" + // decay map (halflife=1d, coefficient=1.0) — same numeric output as + // v0.29.0's inline `1.0 / (1 + days_old)`. Mirror of postgres-engine.ts. 
+ const flatRecency = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: {}, + fallback: { halflifeDays: 1, coefficient: 1.0 }, + }); const { rows } = await this.db.query( `SELECT p.slug, p.source_id, p.title, p.type, p.updated_at, p.emotional_weight, COUNT(DISTINCT t.id) AS take_count, COALESCE(AVG(t.weight), 0) AS take_avg_weight, (p.emotional_weight * 5) + ln(1 + COUNT(DISTINCT t.id)) - + (1.0 / (1 + EXTRACT(EPOCH FROM (now() - p.updated_at)) / 86400)) + + ${flatRecency} AS score FROM pages p LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index e0661abc0..d407c3cc9 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -32,7 +32,7 @@ import { computeAnomaliesFromBuckets } from './cycle/anomaly.ts'; import * as db from './db.ts'; import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult, parseEmbedding, tryParseEmbedding, takeRowToTake } from './utils.ts'; import { resolveBoostMap, resolveHardExcludes } from './search/source-boost.ts'; -import { buildSourceFactorCase, buildHardExcludeClause, buildVisibilityClause } from './search/sql-ranking.ts'; +import { buildSourceFactorCase, buildHardExcludeClause, buildVisibilityClause, buildRecencyComponentSql } from './search/sql-ranking.ts'; // CONNECTION_ERROR_PATTERNS / isConnectionError were used by the per-call // executeRaw retry that #406 originally shipped. Eng-review D3 dropped that @@ -2326,13 +2326,24 @@ export class PostgresEngine implements BrainEngine { const prefixCondition = slugPrefix ? sql`AND p.slug LIKE ${slugPrefix.replace(/[\\%_]/g, (c) => '\\' + c) + '%'} ESCAPE '\\'` : sql``; + // v0.29.1: the third score term moves from inline `1.0 / (1 + days_old)` + // into buildRecencyComponentSql with a "flat" decay map (halflife=1d, + // coefficient=1.0; same numeric output as v0.29.0). 
Commit 7 adds an + // opt-in `recency_bias='on'` param that swaps this for the per-prefix + // map, but the default (this commit) preserves v0.29.0 behavior verbatim. + const flatRecency = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: {}, + fallback: { halflifeDays: 1, coefficient: 1.0 }, + }); const rows = await sql` SELECT p.slug, p.source_id, p.title, p.type, p.updated_at, p.emotional_weight, COUNT(DISTINCT t.id) AS take_count, COALESCE(AVG(t.weight), 0) AS take_avg_weight, (p.emotional_weight * 5) + ln(1 + COUNT(DISTINCT t.id)) - + (1.0 / (1 + EXTRACT(EPOCH FROM (now() - p.updated_at)) / 86400)) + + ${sql.unsafe(flatRecency)} AS score FROM pages p LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE From a2e58993d14303fa592060d5b061e4e5e6746b29 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:24:38 -0700 Subject: [PATCH 08/15] v0.29.1: get_recent_salience gains recency_bias param (default 'flat') MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SalienceOpts.recency_bias: 'flat' | 'on' added; default 'flat' preserves v0.29.0 ranking verbatim. Pass 'on' to opt into per-prefix decay map (concepts/originals/writing/ evergreen; daily/, media/x/, chat/ aggressive decay). When recency_bias='on', the salience query reads COALESCE(p.effective_date, p.updated_at) instead of bare p.updated_at, so the recency component is immune to auto-link updated_at churn — old concepts/ pages just-touched by auto-link don't suddenly look fresh. Both engines (Postgres + PGLite) wire the param through. resolveRecencyDecayMap() honors gbrain.yml + GBRAIN_RECENCY_DECAY env at runtime. MCP op surface: get_recent_salience gains the param with a load-bearing description teaching the agent when to use 'on' vs 'flat' (current state → on; mattering across all time → flat). No silent v0.29.0 behavior change — opt-in only (per D11 codex resolution). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/operations.ts | 16 ++++++++++++++++ src/core/pglite-engine.ts | 31 +++++++++++++++++++++---------- src/core/postgres-engine.ts | 34 ++++++++++++++++++++++------------ src/core/types.ts | 9 +++++++++ 4 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/core/operations.ts b/src/core/operations.ts index 306915d49..8a020a5b7 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -1747,12 +1747,28 @@ const get_recent_salience: Operation = { type: 'string', description: "Optional slug-prefix filter, e.g. 'personal' or 'wiki/people'.", }, + recency_bias: { + type: 'string', + enum: ['flat', 'on'], + description: + "v0.29.1: how to weight recency in the salience score.\n" + + " 'flat' (DEFAULT) — v0.29.0 behavior. Every page gets 1/(1+days_old).\n" + + " Stable, predictable; what most callers want.\n" + + " 'on' — Per-prefix decay map. concepts/originals/writing/\n" + + " become evergreen (recency component = 0); daily/,\n" + + " media/x/, chat/ decay aggressively. Use when the\n" + + " user explicitly biases for recency-aware salience\n" + + " ('what's been salient lately' vs 'what matters\n" + + " in this brain regardless of when').", + }, }, handler: async (ctx, p) => { + const recencyBias = p.recency_bias === 'on' ? 'on' : 'flat'; return ctx.engine.getRecentSalience({ days: typeof p.days === 'number' ? p.days : undefined, limit: typeof p.limit === 'number' ? p.limit : undefined, slugPrefix: typeof p.slugPrefix === 'string' ? 
p.slugPrefix : undefined, + recency_bias: recencyBias, }); }, cliHints: { name: 'salience' }, diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 0448d5873..b7a506762 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -2241,22 +2241,33 @@ export class PGLiteEngine implements BrainEngine { params.push(limit); const limitParam = `$${params.length}`; - // v0.29.1: third score term via buildRecencyComponentSql with a "flat" - // decay map (halflife=1d, coefficient=1.0) — same numeric output as - // v0.29.0's inline `1.0 / (1 + days_old)`. Mirror of postgres-engine.ts. - const flatRecency = buildRecencyComponentSql({ - slugColumn: 'p.slug', - dateExpr: 'p.updated_at', - decayMap: {}, - fallback: { halflifeDays: 1, coefficient: 1.0 }, - }); + // v0.29.1: third score term via buildRecencyComponentSql. Default + // 'flat' = v0.29.0 behavior. 'on' opts into per-prefix decay. + const recencyBias = opts.recency_bias ?? 'flat'; + let recencySql: string; + if (recencyBias === 'on') { + const { resolveRecencyDecayMap, DEFAULT_FALLBACK } = await import('./search/recency-decay.ts'); + recencySql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'COALESCE(p.effective_date, p.updated_at)', + decayMap: resolveRecencyDecayMap(), + fallback: DEFAULT_FALLBACK, + }); + } else { + recencySql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: {}, + fallback: { halflifeDays: 1, coefficient: 1.0 }, + }); + } const { rows } = await this.db.query( `SELECT p.slug, p.source_id, p.title, p.type, p.updated_at, p.emotional_weight, COUNT(DISTINCT t.id) AS take_count, COALESCE(AVG(t.weight), 0) AS take_avg_weight, (p.emotional_weight * 5) + ln(1 + COUNT(DISTINCT t.id)) - + ${flatRecency} + + ${recencySql} AS score FROM pages p LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index d407c3cc9..63d2899c8 100644 --- 
a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -2326,24 +2326,34 @@ export class PostgresEngine implements BrainEngine { const prefixCondition = slugPrefix ? sql`AND p.slug LIKE ${slugPrefix.replace(/[\\%_]/g, (c) => '\\' + c) + '%'} ESCAPE '\\'` : sql``; - // v0.29.1: the third score term moves from inline `1.0 / (1 + days_old)` - // into buildRecencyComponentSql with a "flat" decay map (halflife=1d, - // coefficient=1.0; same numeric output as v0.29.0). Commit 7 adds an - // opt-in `recency_bias='on'` param that swaps this for the per-prefix - // map, but the default (this commit) preserves v0.29.0 behavior verbatim. - const flatRecency = buildRecencyComponentSql({ - slugColumn: 'p.slug', - dateExpr: 'p.updated_at', - decayMap: {}, - fallback: { halflifeDays: 1, coefficient: 1.0 }, - }); + // v0.29.1: third score term via buildRecencyComponentSql. Default + // 'flat' = v0.29.0 behavior (1 / (1 + days_old)). 'on' opts into the + // per-prefix decay map (concepts/ evergreen, daily/ aggressive, etc.). + const recencyBias = opts.recency_bias ?? 
'flat'; + let recencySql: string; + if (recencyBias === 'on') { + const { resolveRecencyDecayMap, DEFAULT_FALLBACK } = await import('./search/recency-decay.ts'); + recencySql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'COALESCE(p.effective_date, p.updated_at)', + decayMap: resolveRecencyDecayMap(), + fallback: DEFAULT_FALLBACK, + }); + } else { + recencySql = buildRecencyComponentSql({ + slugColumn: 'p.slug', + dateExpr: 'p.updated_at', + decayMap: {}, + fallback: { halflifeDays: 1, coefficient: 1.0 }, + }); + } const rows = await sql` SELECT p.slug, p.source_id, p.title, p.type, p.updated_at, p.emotional_weight, COUNT(DISTINCT t.id) AS take_count, COALESCE(AVG(t.weight), 0) AS take_avg_weight, (p.emotional_weight * 5) + ln(1 + COUNT(DISTINCT t.id)) - + ${sql.unsafe(flatRecency)} + + ${sql.unsafe(recencySql)} AS score FROM pages p LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE diff --git a/src/core/types.ts b/src/core/types.ts index 801f78d60..372c8fa29 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -152,6 +152,15 @@ export interface SalienceOpts { limit?: number; /** Optional slug-prefix filter (e.g., `personal`, `wiki/people`). */ slugPrefix?: string; + /** + * v0.29.1 — recency-decay treatment for the salience formula's third term. + * - 'flat' (default): v0.29.0 behavior, `1.0 / (1 + days_old)` for every page + * - 'on': per-prefix decay from DEFAULT_RECENCY_DECAY (concepts/originals + * evergreen; daily/, media/x/ aggressive). Use when the agent wants + * "recency-biased salience" — what's been mattering AND fresh. + * Default preserves v0.29.0 ranking; 'on' is opt-in. 
+ */ + recency_bias?: 'flat' | 'on'; } export interface SalienceResult { From 6132fbc3e05447dce665deea78a7dd0e87d081f4 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 11:26:02 -0700 Subject: [PATCH 09/15] v0.29.1: recompute_emotional_weight writes salience_touched_at; window picks up newly-salient pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setEmotionalWeightBatch on both engines now bumps salience_touched_at to NOW() ONLY when the new emotional_weight differs from the existing one (IS DISTINCT FROM, NULL-safe). No-op writes (same weight) leave the column alone — preserves "actual change" semantics. getRecentSalience window changes from WHERE p.updated_at >= boundary to WHERE GREATEST(p.updated_at, COALESCE(p.salience_touched_at, p.updated_at)) >= boundary Closes codex pass-1 finding #4: pages whose emotional_weight just changed in the dream cycle (because tags or takes shifted) but whose updated_at is older than the salience window now correctly enter the recent-salience results. Without this, "Garry just added a take to a 6-month-old page" stayed invisible to get_recent_salience until the next content edit. COALESCE(salience_touched_at, p.updated_at) handles pre-v0.29.1 rows where salience_touched_at is NULL — they fall back to p.updated_at and behave identically to v0.29.0. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/pglite-engine.ts | 11 +++++++++-- src/core/postgres-engine.ts | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index b7a506762..d04252dd3 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -2213,9 +2213,16 @@ export class PGLiteEngine implements BrainEngine { const sourceIds = rows.map(r => r.source_id); const weights = rows.map(r => r.weight); // Composite-keyed UPDATE FROM unnest (codex C4#3). 
+ // v0.29.1: bump salience_touched_at when emotional_weight actually changes + // so the salience query window picks up newly-salient old pages. Mirror + // of postgres-engine.ts. const result = await this.db.query( `UPDATE pages - SET emotional_weight = u.weight + SET emotional_weight = u.weight, + salience_touched_at = CASE + WHEN pages.emotional_weight IS DISTINCT FROM u.weight THEN now() + ELSE pages.salience_touched_at + END FROM unnest($1::text[], $2::text[], $3::real[]) AS u(slug, source_id, weight) WHERE pages.slug = u.slug AND pages.source_id = u.source_id @@ -2271,7 +2278,7 @@ export class PGLiteEngine implements BrainEngine { AS score FROM pages p LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE - WHERE p.updated_at >= $1::timestamptz + WHERE GREATEST(p.updated_at, COALESCE(p.salience_touched_at, p.updated_at)) >= $1::timestamptz ${prefixCondition} GROUP BY p.id ORDER BY score DESC diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index 63d2899c8..4ababe8cd 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -2304,9 +2304,20 @@ export class PostgresEngine implements BrainEngine { const weights = rows.map(r => r.weight); // Composite-keyed UPDATE FROM unnest (codex C4#3): pages.slug is unique // only within a source, so a slug-only join would fan out across sources. + // + // v0.29.1: bump salience_touched_at to NOW() ONLY when emotional_weight + // actually changes. The salience query window then includes the page in + // GREATEST(updated_at, salience_touched_at) >= boundary, so a previously + // calm page that just became salient surfaces in the recent salience + // results without a content edit. No-op writes (same weight) leave + // salience_touched_at alone — preserves "actual change" semantics. 
const result = await sql` UPDATE pages - SET emotional_weight = u.weight + SET emotional_weight = u.weight, + salience_touched_at = CASE + WHEN pages.emotional_weight IS DISTINCT FROM u.weight THEN now() + ELSE pages.salience_touched_at + END FROM unnest(${slugs}::text[], ${sourceIds}::text[], ${weights}::real[]) AS u(slug, source_id, weight) WHERE pages.slug = u.slug AND pages.source_id = u.source_id @@ -2357,7 +2368,7 @@ export class PostgresEngine implements BrainEngine { AS score FROM pages p LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE - WHERE p.updated_at >= ${boundaryIso}::timestamptz + WHERE GREATEST(p.updated_at, COALESCE(p.salience_touched_at, p.updated_at)) >= ${boundaryIso}::timestamptz ${prefixCondition} GROUP BY p.id ORDER BY score DESC From af6c7682810ec711557614071431c12d4c449ad3 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 12:01:21 -0700 Subject: [PATCH 10/15] =?UTF-8?q?v0.29.1:=20merge=20intent.ts=20=E2=86=92?= =?UTF-8?q?=20query-intent.ts;=20emit=203=20suggestions=20per=20query?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit D1 + D4 + D6 + D8: single regex-pass classifier returning {intent, suggestedDetail, suggestedSalience, suggestedRecency}. intent + suggestedDetail are v0.29.0 behavior verbatim (legacy intent.ts deleted; classifyQueryIntent + autoDetectDetail compat shims preserved). 
NEW for v0.29.1 — two orthogonal recency-axis suggestions: suggestedSalience: 'off' | 'on' | 'strong' suggestedRecency: 'off' | 'on' | 'strong' Resolution rules (per D6 narrow temporal-bound exception): - CANONICAL patterns (who is X / what is Y / code / graph) → both off - UNLESS an EXPLICIT_TEMPORAL_BOUND also matches (today / right now / this week / since X / last N days), in which case temporal-bound wins - STRONG_RECENCY (today / right now / this morning / just now) → strong - RECENCY_ON (latest / recent / this week / meeting prep / catch up / remind me / status update) → on - SALIENCE_ON (catch up / remind me / status update / prep me / what's going on / what matters) → on - default → off for both axes (v0.29.1 prime-directive: pure opt-in) Salience and recency are TRULY orthogonal (per D9). A query like "latest news on AI" → recency='on' but salience='off' (the user wants fresh, not emotionally-weighted). "What's going on with widget-co" → both on. "Who is X right now" → both 'strong'/'on' (temporal bound beats canonical 'who is'). intent.ts deleted; test/intent.test.ts renamed → test/query-intent-legacy.test.ts (unchanged behavior coverage). New test/query-intent.test.ts adds 21 cases covering all three axes' interactions: canonical wins on bare 'who is', temporal bound overrides, "catch me up" matches with up to 15 chars between, "today" → strong, intent vs recency independence. 
Updated callers: - src/core/search/hybrid.ts (autoDetectDetail import) - test/recency-boost.test.ts (classifyQueryIntent import) - test/benchmark-search-quality.ts (autoDetectDetail import) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/search/hybrid.ts | 2 +- src/core/search/intent.ts | 108 -------- src/core/search/query-intent.ts | 253 ++++++++++++++++++ test/benchmark-search-quality.ts | 2 +- ...nt.test.ts => query-intent-legacy.test.ts} | 2 +- test/query-intent.test.ts | 157 +++++++++++ test/recency-boost.test.ts | 2 +- 7 files changed, 414 insertions(+), 112 deletions(-) delete mode 100644 src/core/search/intent.ts create mode 100644 src/core/search/query-intent.ts rename test/{intent.test.ts => query-intent-legacy.test.ts} (99%) create mode 100644 test/query-intent.test.ts diff --git a/src/core/search/hybrid.ts b/src/core/search/hybrid.ts index 174fc68ae..12f52b95c 100644 --- a/src/core/search/hybrid.ts +++ b/src/core/search/hybrid.ts @@ -14,7 +14,7 @@ import { MAX_SEARCH_LIMIT, clampSearchLimit } from '../engine.ts'; import type { SearchResult, SearchOpts, HybridSearchMeta } from '../types.ts'; import { embed } from '../embedding.ts'; import { dedupResults } from './dedup.ts'; -import { autoDetectDetail } from './intent.ts'; +import { autoDetectDetail } from './query-intent.ts'; import { expandAnchors, hydrateChunks } from './two-pass.ts'; import { applyRecencyBoost } from './recency.ts'; diff --git a/src/core/search/intent.ts b/src/core/search/intent.ts deleted file mode 100644 index e28e8f197..000000000 --- a/src/core/search/intent.ts +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Query Intent Classifier - * - * Zero-latency heuristic classifier that detects query intent from text patterns. - * Maps intent to the appropriate detail level for hybrid search. - * - * No LLM call, no API cost, no latency. Pattern matching on query text. 
- */ - -export type QueryIntent = 'entity' | 'temporal' | 'event' | 'general'; - -// Temporal patterns: questions about when things happened, meeting history -const TEMPORAL_PATTERNS = [ - /\bwhen\b/i, - /\blast\s+(met|meeting|call|conversation|chat|talked|spoke|seen|heard|time)\b/i, - /\brecent(ly)?\b/i, - /\bhistory\b/i, - /\btimeline\b/i, - /\bmeeting\s+notes?\b/i, - /\bwhat('s| is| was)\s+new\b/i, - /\blatest\b/i, - /\bupdate(s)?\s+(on|from|about)\b/i, - /\bhow\s+long\s+(ago|since)\b/i, - /\b\d{4}[-/]\d{2}\b/i, // date pattern like 2024-03 - /\blast\s+(week|month|quarter|year)\b/i, -]; - -// Event patterns: specific events, announcements, launches -const EVENT_PATTERNS = [ - /\bannounce[ds]?(ment)?\b/i, - /\blaunch(ed|es|ing)?\b/i, - /\braised?\s+\$?\d/i, - /\bfund(ing|raise)\b/i, - /\bIPO\b/i, - /\bacquisition\b/i, - /\bmerge[drs]?\b/i, - /\bnews\b/i, - /\bhappened?\b/i, -]; - -// Entity patterns: identity questions, overviews -const ENTITY_PATTERNS = [ - /\bwho\s+is\b/i, - /\bwhat\s+(is|does|are)\b/i, - /\btell\s+me\s+about\b/i, - /\bdescribe\b/i, - /\bsummar(y|ize)\b/i, - /\boverview\b/i, - /\bbackground\b/i, - /\bprofile\b/i, - /\bwhat\s+do\s+(you|we)\s+know\b/i, -]; - -// Full-context patterns: requests for everything -const FULL_CONTEXT_PATTERNS = [ - /\beverything\b/i, - /\ball\s+(about|info|information|details)\b/i, - /\bfull\s+(history|context|picture|story|details)\b/i, - /\bcomprehensive\b/i, - /\bdeep\s+dive\b/i, - /\bgive\s+me\s+everything\b/i, -]; - -/** - * Classify query intent from text patterns. - * Returns the detected intent type. 
- */ -export function classifyQueryIntent(query: string): QueryIntent { - // Full context requests → treat as temporal (return everything) - if (FULL_CONTEXT_PATTERNS.some(p => p.test(query))) return 'temporal'; - - // Check temporal patterns first (highest priority for detail=high) - if (TEMPORAL_PATTERNS.some(p => p.test(query))) return 'temporal'; - - // Check event patterns - if (EVENT_PATTERNS.some(p => p.test(query))) return 'event'; - - // Check entity patterns - if (ENTITY_PATTERNS.some(p => p.test(query))) return 'entity'; - - // Default: general query - return 'general'; -} - -/** - * Map query intent to detail level. - * - * entity → 'low' (compiled truth only, user wants the assessment) - * temporal → 'high' (need timeline, user wants dates/events) - * event → 'high' (need timeline, user wants specific events) - * general → undefined (use default medium, let the boost handle it) - */ -export function intentToDetail(intent: QueryIntent): 'low' | 'medium' | 'high' | undefined { - switch (intent) { - case 'entity': return 'low'; - case 'temporal': return 'high'; - case 'event': return 'high'; - case 'general': return undefined; // use default - } -} - -/** - * Auto-detect detail level from query text. - * Returns undefined if no strong signal detected (uses default). - */ -export function autoDetectDetail(query: string): 'low' | 'medium' | 'high' | undefined { - return intentToDetail(classifyQueryIntent(query)); -} diff --git a/src/core/search/query-intent.ts b/src/core/search/query-intent.ts new file mode 100644 index 000000000..66762afa1 --- /dev/null +++ b/src/core/search/query-intent.ts @@ -0,0 +1,253 @@ +/** + * v0.29.1 — merged query-intent classifier. + * + * Replaces v0.29.0's `intent.ts` (which only emitted a detail suggestion). 
+ * After D1 + D4 the codebase needs ONE classifier that returns three + * suggestions from a single regex pass: + * + * - intent: original v0.29.0 type ('entity' | 'temporal' | 'event' | 'general') + * - suggestedDetail: v0.29.0 mapping (entity→low, temporal/event→high) + * - suggestedSalience: NEW for v0.29.1 — 'off' | 'on' | 'strong' + * - suggestedRecency: NEW for v0.29.1 — 'off' | 'on' | 'strong' + * + * Salience and recency are TRULY ORTHOGONAL (per D9): + * - salience boosts pages with high emotional_weight + take_count (mattering) + * - recency boosts pages with recent effective_date (per-prefix decay) + * Both can fire, neither can fire, or just one. + * + * The classifier follows "current state → on. canonical truth → off." with + * a NARROW exception per D6: explicit temporal bounds (today / this week / + * right now / since X / last N days) override canonical-pattern wins. So + * "who is X right now" → suggestedRecency='on' even though "who is" is a + * canonical pattern. + * + * Pure module. No DB, no LLM, no async. Tested in test/query-intent.test.ts. + */ + +export type QueryIntent = 'entity' | 'temporal' | 'event' | 'general'; + +export type SalienceMode = 'off' | 'on' | 'strong'; +export type RecencyMode = 'off' | 'on' | 'strong'; + +export interface QuerySuggestions { + intent: QueryIntent; + /** v0.29.0 detail mapping. entity→low, temporal/event→high, general→undefined. */ + suggestedDetail: 'low' | 'medium' | 'high' | undefined; + /** v0.29.1 — emotional_weight + take_count boost. */ + suggestedSalience: SalienceMode; + /** v0.29.1 — per-prefix age-decay boost. */ + suggestedRecency: RecencyMode; +} + +// ───────────────────────────────────────────────────────── +// Pattern banks (organized by axis they signal) +// ───────────────────────────────────────────────────────── + +// Original v0.29.0 intent patterns. Drive .intent + .suggestedDetail. 
+const TEMPORAL_PATTERNS = [ + /\bwhen\b/i, + /\blast\s+(met|meeting|call|conversation|chat|talked|spoke|seen|heard|time)\b/i, + /\brecent(ly)?\b/i, + /\bhistory\b/i, + /\btimeline\b/i, + /\bmeeting\s+notes?\b/i, + /\bwhat('s| is| was)\s+new\b/i, + /\blatest\b/i, + /\bupdate(s)?\s+(on|from|about)\b/i, + /\bhow\s+long\s+(ago|since)\b/i, + /\b\d{4}[-/]\d{2}\b/i, + /\blast\s+(week|month|quarter|year)\b/i, +]; + +const EVENT_PATTERNS = [ + /\bannounce[ds]?(ment)?\b/i, + /\blaunch(ed|es|ing)?\b/i, + /\braised?\s+\$?\d/i, + /\bfund(ing|raise)\b/i, + /\bIPO\b/i, + /\bacquisition\b/i, + /\bmerge[drs]?\b/i, + /\bnews\b/i, + /\bhappened?\b/i, +]; + +const ENTITY_PATTERNS = [ + /\bwho\s+is\b/i, + /\bwhat\s+(is|does|are)\b/i, + /\btell\s+me\s+about\b/i, + /\bdescribe\b/i, + /\bsummar(y|ize)\b/i, + /\boverview\b/i, + /\bbackground\b/i, + /\bprofile\b/i, + /\bwhat\s+do\s+(you|we)\s+know\b/i, +]; + +const FULL_CONTEXT_PATTERNS = [ + /\beverything\b/i, + /\ball\s+(about|info|information|details)\b/i, + /\bfull\s+(history|context|picture|story|details)\b/i, + /\bcomprehensive\b/i, + /\bdeep\s+dive\b/i, + /\bgive\s+me\s+everything\b/i, +]; + +// v0.29.1 — recency-axis patterns +// +// Canonical patterns: queries asking for the authoritative / definitional +// answer. These signal recency='off' even when other axes match — UNLESS +// an explicit temporal bound is present (per D6 narrow exception). +const CANONICAL_PATTERNS = [ + /\bwho\s+is\b/i, + /\bwhat\s+(is|are|does|means?)\b/i, + /\bdefin(e|ition|ing)\b/i, + /\bexplain\s+(what|how|why)\b/i, + /\b(history|origin|background)\s+of\b/i, + /\bconcept\s+of\b/i, + /\boverview\s+of\b/i, + /\btell\s+me\s+about\b/i, + /\bcompiled\s+truth\b/i, + /::|->|\.\w+\(/, + /\b(function|class|method|module)\s+\w+/i, + /\b(graph|traversal|backlinks?|inbound|outbound)\b/i, +]; + +// Aggressive recency: "today", "right now", "this morning", "just now". 
+const STRONG_RECENCY_PATTERNS = [ + /\btoday\b/i, + /\bright\s+now\b/i, + /\bthis\s+morning\b/i, + /\bjust\s+now\b/i, +]; + +// Moderate recency: "what's going on", "latest", "recent", "this week", +// meeting prep, conversation recall, status updates. +const RECENCY_ON_PATTERNS = [ + /\bwhat'?s\s+(going\s+on|happening|new|latest|up)\b/i, + /\b(latest|recent(ly)?|currently)\b/i, + /\b(this|last|past)\s+(week|month|few\s+days|couple\s+days)\b/i, + /\bmeeting\s+(prep|with|for|notes?|brief)\b/i, + /\bbefore\s+(my|the|our)\s+(meeting|call|sync|chat)\b/i, + /\bprep(are)?\s+(for|me)\b/i, + /\bcatch(es|ing)?\b[\s\w]{0,15}\bup\b/i, // "catch up", "catch me up", "catching X up" + /\bremind\s+me\s+(what|about|of)\b/i, + /\b(update|status|progress)\s+(on|with|from)\b/i, +]; + +// Per D6: explicit temporal bounds override canonical-wins. "Who is X today" +// → recency='on' (temporal bound wins). "Who is X" alone → recency='off'. +const EXPLICIT_TEMPORAL_BOUND_PATTERNS = [ + /\btoday\b/i, + /\bright\s+now\b/i, + /\bthis\s+morning\b/i, + /\bthis\s+week\b/i, + /\bsince\s+(launch|last|the|\d)/i, + /\blast\s+\d+\s+(day|days|week|weeks|month|months)\b/i, +]; + +// v0.29.1 — salience-axis patterns +// +// Salience suggests "what matters in this brain right now" — when the user +// is asking about people/companies/deals in the current context, they +// usually want the emotionally-weighted + take-rich pages to surface. +// Salience patterns are a subset of recency-on patterns (meeting prep, +// catch-up, update language) plus people-centric phrasings. 
+const SALIENCE_ON_PATTERNS = [ + /\bwhat'?s\s+(going\s+on|happening|been\s+going|been\s+up)\b/i, + /\bcatch(es|ing)?\b[\s\w]{0,15}\bup\b/i, + /\bremind\s+me\s+(what|about|of)\b/i, + /\bprep(are)?\s+(for|me)\b/i, + /\bbefore\s+(my|the|our)\s+(meeting|call|sync|chat)\b/i, + /\bmeeting\s+(prep|with|for|brief)\b/i, + /\b(update|status|progress)\s+(on|with|from)\b/i, + /\bwhat\s+matters\b/i, + /\bwhat'?s\s+important\b/i, +]; + +// ───────────────────────────────────────────────────────── +// Classifier +// ───────────────────────────────────────────────────────── + +function matches(patterns: RegExp[], q: string): boolean { + for (const re of patterns) if (re.test(q)) return true; + return false; +} + +/** + * Classify a query and return all three axis suggestions. + * + * Resolution rules: + * - intent: original v0.29.0 priority (full-context > temporal > event > entity > general) + * - suggestedDetail: intent → detail mapping (entity=low, temporal/event=high) + * - suggestedRecency: STRONG_RECENCY > RECENCY_ON; CANONICAL wins UNLESS + * EXPLICIT_TEMPORAL_BOUND also matches; default 'off' + * - suggestedSalience: SALIENCE_ON; CANONICAL wins UNLESS + * EXPLICIT_TEMPORAL_BOUND; default 'off' + * + * Note: salience and recency are independent. A "what's going on with X" + * query gets BOTH on; "who is X" gets BOTH off; "today's news" gets + * recency='strong' but salience='off' (the user wants newest, not + * emotionally-weighted). 
+ */ +export function classifyQuery(query: string): QuerySuggestions { + const intent = classifyQueryIntent(query); + const suggestedDetail = intentToDetail(intent); + + const hasCanonical = matches(CANONICAL_PATTERNS, query); + const hasTemporalBound = matches(EXPLICIT_TEMPORAL_BOUND_PATTERNS, query); + const hasStrongRecency = matches(STRONG_RECENCY_PATTERNS, query); + const hasRecencyOn = matches(RECENCY_ON_PATTERNS, query); + const hasSalienceOn = matches(SALIENCE_ON_PATTERNS, query); + + // Recency axis + let suggestedRecency: RecencyMode; + if (hasCanonical && !hasTemporalBound) { + suggestedRecency = 'off'; + } else if (hasStrongRecency) { + suggestedRecency = 'strong'; + } else if (hasRecencyOn) { + suggestedRecency = 'on'; + } else { + suggestedRecency = 'off'; + } + + // Salience axis (orthogonal) + let suggestedSalience: SalienceMode; + if (hasCanonical && !hasTemporalBound) { + suggestedSalience = 'off'; + } else if (hasSalienceOn) { + suggestedSalience = 'on'; + } else { + suggestedSalience = 'off'; + } + + return { intent, suggestedDetail, suggestedSalience, suggestedRecency }; +} + +// ───────────────────────────────────────────────────────── +// v0.29.0 compatibility shims +// ───────────────────────────────────────────────────────── + +/** v0.29.0 intent type. Preserved verbatim for back-compat. */ +export function classifyQueryIntent(query: string): QueryIntent { + if (matches(FULL_CONTEXT_PATTERNS, query)) return 'temporal'; + if (matches(TEMPORAL_PATTERNS, query)) return 'temporal'; + if (matches(EVENT_PATTERNS, query)) return 'event'; + if (matches(ENTITY_PATTERNS, query)) return 'entity'; + return 'general'; +} + +/** v0.29.0 mapping. */ +export function intentToDetail(intent: QueryIntent): 'low' | 'medium' | 'high' | undefined { + switch (intent) { + case 'entity': return 'low'; + case 'temporal': return 'high'; + case 'event': return 'high'; + case 'general': return undefined; + } +} + +/** v0.29.0 helper. 
Routes through classifyQuery internally. */ +export function autoDetectDetail(query: string): 'low' | 'medium' | 'high' | undefined { + return classifyQuery(query).suggestedDetail; +} diff --git a/test/benchmark-search-quality.ts b/test/benchmark-search-quality.ts index 347a7d0a2..ccd90fba6 100644 --- a/test/benchmark-search-quality.ts +++ b/test/benchmark-search-quality.ts @@ -13,7 +13,7 @@ import { PGLiteEngine } from '../src/core/pglite-engine.ts'; import { rrfFusion } from '../src/core/search/hybrid.ts'; import { dedupResults } from '../src/core/search/dedup.ts'; import { precisionAtK, recallAtK, mrr, ndcgAtK } from '../src/core/search/eval.ts'; -import { autoDetectDetail } from '../src/core/search/intent.ts'; +import { autoDetectDetail } from '../src/core/search/query-intent.ts'; import type { SearchResult, ChunkInput } from '../src/core/types.ts'; const RRF_K = 60; diff --git a/test/intent.test.ts b/test/query-intent-legacy.test.ts similarity index 99% rename from test/intent.test.ts rename to test/query-intent-legacy.test.ts index c092c1828..971f311f6 100644 --- a/test/intent.test.ts +++ b/test/query-intent-legacy.test.ts @@ -3,7 +3,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { classifyQueryIntent, autoDetectDetail } from '../src/core/search/intent.ts'; +import { classifyQueryIntent, autoDetectDetail } from '../src/core/search/query-intent.ts'; describe('classifyQueryIntent', () => { describe('entity queries', () => { diff --git a/test/query-intent.test.ts b/test/query-intent.test.ts new file mode 100644 index 000000000..a288cd585 --- /dev/null +++ b/test/query-intent.test.ts @@ -0,0 +1,157 @@ +/** + * v0.29.1 — merged query-intent classifier tests. + * + * Covers the new classifyQuery(query) returning {intent, suggestedDetail, + * suggestedSalience, suggestedRecency}. Legacy intent.ts behavior is + * preserved in test/query-intent-legacy.test.ts (which imports the + * classifyQueryIntent + autoDetectDetail compat shims). 
+ * + * Pure regex; no DB. + */ + +import { describe, test, expect } from 'bun:test'; +import { classifyQuery } from '../src/core/search/query-intent.ts'; + +describe('classifyQuery — entity / canonical queries → both axes off', () => { + test('"who is widget-ceo" → recency=off, salience=off', () => { + const r = classifyQuery('who is widget-ceo'); + expect(r.intent).toBe('entity'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + expect(r.suggestedDetail).toBe('low'); + }); + + test('"what is recursion" → both off', () => { + const r = classifyQuery('what is recursion'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + }); + + test('"tell me about widget-co" → both off', () => { + const r = classifyQuery('tell me about widget-co'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + }); + + test('"history of X" → both off (canonical)', () => { + const r = classifyQuery('history of acme corp'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + }); + + test('code lookup syntax → both off', () => { + const r = classifyQuery('Foo::bar() returns null'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + }); + + test('graph traversal language → both off', () => { + const r = classifyQuery('show me backlinks to widget-co'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + }); +}); + +describe('classifyQuery — current-state queries → both axes on', () => { + test('"what\'s going on with widget-co" → both on', () => { + const r = classifyQuery("what's going on with widget-co"); + expect(r.suggestedRecency).toBe('on'); + expect(r.suggestedSalience).toBe('on'); + }); + + test('"catch me up on acme" → both on', () => { + const r = classifyQuery('catch me up on acme'); + expect(r.suggestedRecency).toBe('on'); + expect(r.suggestedSalience).toBe('on'); + }); + 
+ test('"prep me for the widget-ceo meeting" → both on', () => { + const r = classifyQuery('prep me for the widget-ceo meeting'); + expect(r.suggestedRecency).toBe('on'); + expect(r.suggestedSalience).toBe('on'); + }); + + test('"before my meeting with X" → both on', () => { + const r = classifyQuery('before my meeting with widget-ceo'); + expect(r.suggestedRecency).toBe('on'); + expect(r.suggestedSalience).toBe('on'); + }); + + test('"remind me about acme" → both on', () => { + const r = classifyQuery('remind me about acme'); + expect(r.suggestedRecency).toBe('on'); + expect(r.suggestedSalience).toBe('on'); + }); +}); + +describe('classifyQuery — recency-only patterns (no salience signal)', () => { + test('"latest news on AI" → recency=on, salience=off', () => { + const r = classifyQuery('latest news on AI'); + expect(r.suggestedRecency).toBe('on'); + expect(r.suggestedSalience).toBe('off'); + }); + + test('"this week\'s updates" → recency=on, salience=off', () => { + const r = classifyQuery("this week's updates"); + expect(r.suggestedRecency).toBe('on'); + // "updates" + "on/with/from" pattern needed for salience + expect(r.suggestedSalience).toBe('off'); + }); +}); + +describe('classifyQuery — strong recency ("today" / "right now")', () => { + test('"what happened today" → recency=strong', () => { + const r = classifyQuery('what happened today'); + expect(r.suggestedRecency).toBe('strong'); + }); + + test('"right now what is the status" → strong', () => { + const r = classifyQuery('right now what is the status of the deal'); + // "what is" canonical fires; but "right now" is a temporal bound + expect(r.suggestedRecency).toBe('strong'); + }); +}); + +describe('classifyQuery — D6 narrow temporal-bound exception', () => { + test('"who is widget-ceo right now" → recency=strong (temporal bound wins)', () => { + const r = classifyQuery('who is widget-ceo right now'); + expect(r.suggestedRecency).toBe('strong'); + }); + + test('"who is widget-ceo today" → 
recency=strong', () => { + const r = classifyQuery('who is widget-ceo today'); + expect(r.suggestedRecency).toBe('strong'); + }); + + test('"who is widget-ceo" (no bound) → recency=off (canonical wins)', () => { + const r = classifyQuery('who is widget-ceo'); + expect(r.suggestedRecency).toBe('off'); + }); + + test('"what is widget-co\'s status this week" → recency=on (temporal bound wins)', () => { + const r = classifyQuery("what is widget-co's status this week"); + expect(r.suggestedRecency).toBe('on'); + }); +}); + +describe('classifyQuery — orthogonality of axes', () => { + test('default plain query → both off', () => { + const r = classifyQuery('the quick brown fox'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + expect(r.intent).toBe('general'); + expect(r.suggestedDetail).toBeUndefined(); + }); + + test('intent vs recency are independent axes', () => { + // "when did widget-co IPO": both 'when' (temporal) and 'IPO' (event) + // match v0.29.0 patterns. classifyQueryIntent's priority is + // temporal > event so .intent = 'temporal'. But recency depends on + // CANONICAL/RECENCY_ON patterns, not on .intent — neither set + // matches here, so suggestedRecency = 'off'. 
+ const r = classifyQuery('when did widget-co IPO'); + expect(r.intent).toBe('temporal'); + expect(r.suggestedRecency).toBe('off'); + expect(r.suggestedSalience).toBe('off'); + }); +}); diff --git a/test/recency-boost.test.ts b/test/recency-boost.test.ts index 518e452c9..dfebddaeb 100644 --- a/test/recency-boost.test.ts +++ b/test/recency-boost.test.ts @@ -109,7 +109,7 @@ describe('applyRecencyBoost', () => { }); // Intent detection tests (recency is auto-triggered by temporal intent) -import { classifyQueryIntent } from '../src/core/search/intent.ts'; +import { classifyQueryIntent } from '../src/core/search/query-intent.ts'; describe('intent classification → recency triggering', () => { it('"what\'s new with Ollama" → temporal (triggers recency)', () => { From b5048a72408a6437f00b86d2f6ef22fa7d26f441 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 12:12:09 -0700 Subject: [PATCH 11/15] v0.29.1: applySalienceBoost + applyRecencyBoost + runPostFusionStages wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit D9 + codex pass-1 #2 + #3 + pass-2 #4: salience and recency are TRULY ORTHOGONAL post-fusion stages, both running from ALL THREE hybridSearch return paths (keyword-only, embed-failure-fallback, full-hybrid). NEW src/core/search/hybrid.ts exports: - applySalienceBoost(results, scores, strength) score *= 1 + k * log(1 + score) where k = 0.15 (on) or 0.30 (strong) No time component. Pure mattering signal. - applyRecencyBoost(results, dates, strength, decayMap, fallback, nowMs?) Per-prefix decay factor: 1 + strengthMul * coefficient * halflife / (halflife + days_old) strengthMul: 1.0 (on) or 1.5 (strong) Evergreen prefixes (halflifeDays=0) skipped (factor 1.0). Pure recency signal. Independent of mattering. - runPostFusionStages(engine, results, opts) Wraps backlink + salience + recency. Called from EACH return path so keyless installs and embed failures get the same boost surface as the full hybrid path. 
NEW engine methods (composite-keyed for multi-source isolation): - getEffectiveDates(refs: Array<{slug, source_id}>): Map Returns COALESCE(effective_date, updated_at, created_at). Key format: `${source_id}::${slug}`. Mirror of getBacklinkCounts shape. - getSalienceScores(refs: Array<{slug, source_id}>): Map Returns emotional_weight × 5 + ln(1 + take_count). Composite key. Deprecated (kept for back-compat through v0.29.x): - SearchOpts.afterDate / beforeDate (alias for since/until) - SearchOpts.recencyBoost: 0|1|2 (alias for recency: 'off'|'on'|'strong') - getPageTimestamps (use getEffectiveDates instead) NEW SearchOpts fields: - salience: 'off' | 'on' | 'strong' - recency: 'off' | 'on' | 'strong' - since: string (ISO-8601 or relative, replaces afterDate) - until: string (replaces beforeDate) Resolution: caller-explicit > legacy alias (recencyBoost) > heuristic (classifyQuery's suggestedSalience / suggestedRecency). Deleted: src/core/search/recency.ts (PR #618's, replaced) + test/recency-boost.test.ts (its scope is replaced by query-intent.test.ts + future post-fusion tests). Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: Wintermute --- src/core/engine.ts | 23 ++++ src/core/pglite-engine.ts | 42 +++++++ src/core/postgres-engine.ts | 50 +++++++++ src/core/search/hybrid.ts | 218 +++++++++++++++++++++++++++++------- src/core/search/recency.ts | 68 ----------- src/core/types.ts | 35 +++++- test/recency-boost.test.ts | 138 ----------------------- 7 files changed, 326 insertions(+), 248 deletions(-) delete mode 100644 src/core/search/recency.ts delete mode 100644 test/recency-boost.test.ts diff --git a/src/core/engine.ts b/src/core/engine.ts index 7f11c6dc1..980d07cc4 100644 --- a/src/core/engine.ts +++ b/src/core/engine.ts @@ -381,8 +381,31 @@ export interface BrainEngine { * v0.27.0: for a list of slugs, return their updated_at timestamps (or created_at fallback). * Used by hybrid search recency boost. Single SQL query, not N+1. 
* Slugs with no timestamp get no entry in the map. + * + * @deprecated v0.29.1: prefer getEffectiveDates (composite-keyed, multi-source-safe). + * Kept for back-compat with PR #618 callers. */ getPageTimestamps(slugs: string[]): Promise>; + /** + * v0.29.1: for a list of (slug, source_id) refs, return COALESCE(effective_date, + * updated_at) per ref. Single SQL query. Composite-keyed map (key format: + * `${source_id}::${slug}`) so multi-source brains don't conflate pages with + * the same slug across sources (codex pass-1 finding #3). + * + * Drives the new applyRecencyBoost post-fusion stage. Returns NULL for refs + * with no row; map omits them. + */ + getEffectiveDates(refs: Array<{slug: string; source_id: string}>): Promise>; + /** + * v0.29.1: for a list of (slug, source_id) refs, return the salience score + * (emotional_weight × 5 + ln(1 + take_count)) per ref. Single SQL query. + * Composite-keyed (`${source_id}::${slug}`) like getEffectiveDates. + * + * Drives the new applySalienceBoost post-fusion stage. Pages with no row + * (or zero emotional_weight + zero takes) get score = 0; the boost stage + * skips them. + */ + getSalienceScores(refs: Array<{slug: string; source_id: string}>): Promise>; /** * Return every page with no inbound links (from any source). * Domain comes from the frontmatter `domain` field (null if unset). 
diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index d04252dd3..561343728 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -1226,6 +1226,48 @@ export class PGLiteEngine implements BrainEngine { return new Map(rows.map((r: any) => [r.slug as string, new Date(r.ts as string)])); } + async getEffectiveDates(refs: Array<{slug: string; source_id: string}>): Promise> { + if (refs.length === 0) return new Map(); + const slugs = refs.map(r => r.slug); + const sourceIds = refs.map(r => r.source_id); + const { rows } = await this.db.query( + `SELECT p.slug, p.source_id, COALESCE(p.effective_date, p.updated_at, p.created_at) AS ts + FROM pages p + JOIN unnest($1::text[], $2::text[]) AS u(slug, source_id) + ON p.slug = u.slug AND p.source_id = u.source_id`, + [slugs, sourceIds], + ); + const out = new Map(); + for (const r of rows as Array<{slug: string; source_id: string; ts: string | Date}>) { + const key = `${r.source_id}::${r.slug}`; + out.set(key, r.ts instanceof Date ? 
r.ts : new Date(r.ts)); + } + return out; + } + + async getSalienceScores(refs: Array<{slug: string; source_id: string}>): Promise> { + if (refs.length === 0) return new Map(); + const slugs = refs.map(r => r.slug); + const sourceIds = refs.map(r => r.source_id); + const { rows } = await this.db.query( + `SELECT p.slug, p.source_id, + (COALESCE(p.emotional_weight, 0) * 5 + + ln(1 + COUNT(DISTINCT t.id))) AS score + FROM pages p + JOIN unnest($1::text[], $2::text[]) AS u(slug, source_id) + ON p.slug = u.slug AND p.source_id = u.source_id + LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE + GROUP BY p.id`, + [slugs, sourceIds], + ); + const out = new Map(); + for (const r of rows as Array<{slug: string; source_id: string; score: number | string}>) { + const key = `${r.source_id}::${r.slug}`; + out.set(key, Number(r.score)); + } + return out; + } + async findOrphanPages(): Promise> { const { rows } = await this.db.query( `SELECT diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index 4ababe8cd..79d4ae5f5 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -1312,6 +1312,56 @@ export class PostgresEngine implements BrainEngine { return new Map(rows.map(r => [r.slug as string, new Date(r.ts as string)])); } + async getEffectiveDates(refs: Array<{slug: string; source_id: string}>): Promise> { + if (refs.length === 0) return new Map(); + const sql = this.sql; + const slugs = refs.map(r => r.slug); + const sourceIds = refs.map(r => r.source_id); + // Composite-keyed: a page is unique by (source_id, slug). unnest the + // two arrays in lockstep so multi-source brains don't fan out across + // sources (codex pass-1 finding #3). 
+ const rows = await sql` + SELECT p.slug, p.source_id, COALESCE(p.effective_date, p.updated_at, p.created_at) AS ts + FROM pages p + JOIN unnest(${slugs}::text[], ${sourceIds}::text[]) AS u(slug, source_id) + ON p.slug = u.slug AND p.source_id = u.source_id + `; + const out = new Map(); + for (const raw of rows as unknown as Array>) { + const r = raw as { slug: string; source_id: string; ts: string | Date }; + const key = `${r.source_id}::${r.slug}`; + out.set(key, r.ts instanceof Date ? r.ts : new Date(r.ts)); + } + return out; + } + + async getSalienceScores(refs: Array<{slug: string; source_id: string}>): Promise> { + if (refs.length === 0) return new Map(); + const sql = this.sql; + const slugs = refs.map(r => r.slug); + const sourceIds = refs.map(r => r.source_id); + // Salience = emotional_weight × 5 + ln(1 + take_count). Pure mattering + // signal — NO time component (per D9: salience and recency are + // orthogonal axes). Composite-keyed for multi-source isolation. + const rows = await sql` + SELECT p.slug, p.source_id, + (COALESCE(p.emotional_weight, 0) * 5 + + ln(1 + COUNT(DISTINCT t.id))) AS score + FROM pages p + JOIN unnest(${slugs}::text[], ${sourceIds}::text[]) AS u(slug, source_id) + ON p.slug = u.slug AND p.source_id = u.source_id + LEFT JOIN takes t ON t.page_id = p.id AND t.active = TRUE + GROUP BY p.id + `; + const out = new Map(); + for (const raw of rows as unknown as Array>) { + const r = raw as { slug: string; source_id: string; score: number | string }; + const key = `${r.source_id}::${r.slug}`; + out.set(key, Number(r.score)); + } + return out; + } + async findOrphanPages(): Promise> { const sql = this.sql; const rows = await sql` diff --git a/src/core/search/hybrid.ts b/src/core/search/hybrid.ts index 12f52b95c..d23911e64 100644 --- a/src/core/search/hybrid.ts +++ b/src/core/search/hybrid.ts @@ -14,9 +14,8 @@ import { MAX_SEARCH_LIMIT, clampSearchLimit } from '../engine.ts'; import type { SearchResult, SearchOpts, HybridSearchMeta } from 
'../types.ts'; import { embed } from '../embedding.ts'; import { dedupResults } from './dedup.ts'; -import { autoDetectDetail } from './query-intent.ts'; +import { autoDetectDetail, classifyQuery } from './query-intent.ts'; import { expandAnchors, hydrateChunks } from './two-pass.ts'; -import { applyRecencyBoost } from './recency.ts'; const RRF_K = 60; const COMPILED_TRUTH_BOOST = 2.0; @@ -46,6 +45,147 @@ export function applyBacklinkBoost(results: SearchResult[], counts: Map, + strength: 'on' | 'strong', +): void { + const k = strength === 'strong' ? 0.30 : 0.15; + for (const r of results) { + const key = `${r.source_id ?? 'default'}::${r.slug}`; + const score = scores.get(key); + if (!score || score <= 0) continue; + r.score *= (1.0 + k * Math.log(1 + score)); + } +} + +/** + * v0.29.1 — apply per-prefix recency boost. Mutate-in-place; caller re-sorts. + * + * `dates` is keyed by `${source_id}::${slug}`. The boost factor for each + * page comes from the per-prefix decay map: `1 + coefficient × halflife / + * (halflife + days_old)`. Evergreen prefixes (halflifeDays=0) contribute 0 + * (factor stays 1.0). + * + * strength: 'on' multiplies the coefficient by 1.0; 'strong' multiplies by + * 1.5 (more aggressive recency tilt). Pages with no date entry in the map + * are skipped (factor 1.0). + */ +export function applyRecencyBoost( + results: SearchResult[], + dates: Map, + strength: 'on' | 'strong', + decayMap: import('./recency-decay.ts').RecencyDecayMap, + fallback: import('./recency-decay.ts').RecencyDecayConfig, + nowMs: number = Date.now(), +): void { + const strengthMul = strength === 'strong' ? 1.5 : 1.0; + // Sort prefixes longest-first so 'media/articles/' matches before 'media/'. + const prefixes = Object.keys(decayMap).sort((a, b) => b.length - a.length); + + for (const r of results) { + const key = `${r.source_id ?? 
'default'}::${r.slug}`; + const d = dates.get(key); + if (!d) continue; + const daysOld = Math.max(0, (nowMs - d.getTime()) / 86_400_000); + + // Find first matching prefix. + let cfg: import('./recency-decay.ts').RecencyDecayConfig = fallback; + for (const p of prefixes) { + if (r.slug.startsWith(p)) { + cfg = decayMap[p]; + break; + } + } + + if (cfg.halflifeDays === 0 || cfg.coefficient === 0) continue; // evergreen + const recencyComponent = cfg.coefficient * cfg.halflifeDays / (cfg.halflifeDays + daysOld); + const factor = 1.0 + strengthMul * recencyComponent; + r.score *= factor; + } +} + +/** + * v0.29.1 — runPostFusionStages: wrap backlink + salience + recency in a + * single stage that fires from EVERY hybridSearch return path (codex + * pass-1 #2 + pass-2 #4: keyword-only, embed-fail-fallback, full-hybrid). + * Without this wrapper, salience='on' silently does nothing on keyless + * installs that fall back to keyword-only. + * + * Mutates `results` in place; caller re-sorts. + */ +export interface PostFusionOpts { + applyBacklinks: boolean; + salience: 'off' | 'on' | 'strong'; + recency: 'off' | 'on' | 'strong'; + decayMap?: import('./recency-decay.ts').RecencyDecayMap; + fallback?: import('./recency-decay.ts').RecencyDecayConfig; +} + +export async function runPostFusionStages( + engine: import('../engine.ts').BrainEngine, + results: SearchResult[], + opts: PostFusionOpts, +): Promise { + if (results.length === 0) return; + + // Backlink stage (existing behavior, preserved). + if (opts.applyBacklinks) { + try { + const slugs = Array.from(new Set(results.map(r => r.slug))); + const counts = await engine.getBacklinkCounts(slugs); + applyBacklinkBoost(results, counts); + } catch { + // Non-fatal; preserves the existing pre-v0.29.1 contract. + } + } + + // Composite refs for the orthogonal axes (multi-source isolation). + const refs = Array.from( + new Map( + results.map(r => [`${r.source_id ?? 
'default'}::${r.slug}`, { slug: r.slug, source_id: r.source_id ?? 'default' }]), + ).values(), + ); + + // Salience stage (mattering, no time). + if (opts.salience !== 'off') { + try { + const scores = await engine.getSalienceScores(refs); + applySalienceBoost(results, scores, opts.salience); + } catch { + // Non-fatal. + } + } + + // Recency stage (per-prefix decay, no mattering). + if (opts.recency !== 'off') { + try { + const dates = await engine.getEffectiveDates(refs); + const { DEFAULT_RECENCY_DECAY, DEFAULT_FALLBACK } = await import('./recency-decay.ts'); + applyRecencyBoost( + results, + dates, + opts.recency, + opts.decayMap ?? DEFAULT_RECENCY_DECAY, + opts.fallback ?? DEFAULT_FALLBACK, + ); + } catch { + // Non-fatal. + } + } +} + export interface HybridSearchOpts extends SearchOpts { expansion?: boolean; expandFn?: (query: string) => Promise; @@ -112,19 +252,29 @@ export async function hybridSearch( // Run keyword search (always available, no API key needed) const keywordResults = await engine.searchKeyword(query, searchOpts); + // v0.29.1: resolve salience/recency from caller (back-compat aliases for + // PR #618's `recencyBoost` numeric scale) or fall back to the heuristic. + // The wrapper fires from ALL THREE return paths (codex pass-1 #2 + pass-2 #4). + const suggestions = classifyQuery(query); + // Back-compat: recencyBoost: 1|2 → 'on'|'strong'; 0 → 'off'. + const legacyRecency: 'off' | 'on' | 'strong' | undefined = + opts?.recencyBoost === 2 ? 'strong' : + opts?.recencyBoost === 1 ? 'on' : + opts?.recencyBoost === 0 ? 'off' : + undefined; + const salienceMode: 'off' | 'on' | 'strong' = opts?.salience ?? suggestions.suggestedSalience; + const recencyMode: 'off' | 'on' | 'strong' = opts?.recency ?? legacyRecency ?? 
suggestions.suggestedRecency; + const postFusionOpts = { + applyBacklinks: true, + salience: salienceMode, + recency: recencyMode, + }; + // Skip vector search entirely if no OpenAI key is configured if (!process.env.OPENAI_API_KEY) { - // Apply backlink boost in keyword-only path too. One getBacklinkCounts query - // per search request; not N+1. if (keywordResults.length > 0) { - try { - const slugs = Array.from(new Set(keywordResults.map(r => r.slug))); - const counts = await engine.getBacklinkCounts(slugs); - applyBacklinkBoost(keywordResults, counts); - keywordResults.sort((a, b) => b.score - a.score); - } catch { - // Boost failure is non-fatal: keep unboosted ranking. - } + await runPostFusionStages(engine, keywordResults, postFusionOpts); + keywordResults.sort((a, b) => b.score - a.score); } emitMeta({ vector_enabled: false, detail_resolved: detailResolved, expansion_applied: false }); return dedupResults(keywordResults).slice(offset, offset + limit); @@ -160,6 +310,13 @@ export async function hybridSearch( if (vectorLists.length === 0) { // Embed/vector failed silently; record that vector did not run. + // v0.29.1 codex pass-2 #4: this is the third return path. Apply + // post-fusion stages here too — without it, salience='on' silently + // does nothing on embed failures. + if (keywordResults.length > 0) { + await runPostFusionStages(engine, keywordResults, postFusionOpts); + keywordResults.sort((a, b) => b.score - a.score); + } emitMeta({ vector_enabled: false, detail_resolved: detailResolved, expansion_applied: expansionApplied }); return dedupResults(keywordResults).slice(offset, offset + limit); } @@ -174,18 +331,13 @@ export async function hybridSearch( fused = await cosineReScore(engine, fused, queryEmbedding); } - // Apply backlink boost AFTER cosine re-score so the boost survives normalization, - // and BEFORE dedup so it influences which chunks per page survive deduplication. - // One DB query for the whole result set (not N+1). 
+ // v0.29.1: post-fusion stages (backlink + salience + recency) run via + // runPostFusionStages so all three early-return paths share the same + // boost surface. Salience and recency are independent axes — either, + // both, or neither fires depending on resolved modes. if (fused.length > 0) { - try { - const slugs = Array.from(new Set(fused.map(r => r.slug))); - const counts = await engine.getBacklinkCounts(slugs); - applyBacklinkBoost(fused, counts); - fused.sort((a, b) => b.score - a.score); - } catch { - // Boost failure is non-fatal: keep blended cosine ranking. - } + await runPostFusionStages(engine, fused, postFusionOpts); + fused.sort((a, b) => b.score - a.score); } // v0.20.0 Cathedral II Layer 7 (A2): two-pass structural expansion. @@ -232,22 +384,10 @@ export async function hybridSearch( } } - // v0.27.0: recency boost — applied after backlink boost, before dedup. - // Auto-enabled when intent is temporal/event (detail='high'), or when - // opts.recencyBoost is explicitly set. Strength 1 = moderate (30-day - // halflife), 2 = aggressive (7-day halflife). Connection to intent.ts: - // temporal/event queries → detail='high' → recencyStrength=1 here. - const recencyStrength = opts?.recencyBoost ?? (detail === 'high' ? 1 : 0); - if (recencyStrength > 0 && fused.length > 0) { - try { - const recencySlugs = Array.from(new Set(fused.map(r => r.slug))); - const timestamps = await engine.getPageTimestamps(recencySlugs); - applyRecencyBoost(fused, timestamps, recencyStrength as 1 | 2); - fused.sort((a, b) => b.score - a.score); - } catch { - // Recency boost failure is non-fatal: keep existing ranking. - } - } + // v0.27.0 PR #618 recency boost was here; v0.29.1 unifies it into + // runPostFusionStages above so all three return paths get the same + // treatment. PR #618's recencyBoost: 0|1|2 still works via back-compat + // aliasing in the postFusionOpts resolver near line ~256. 
// Dedup const deduped = dedupResults(fused, dedupOpts); diff --git a/src/core/search/recency.ts b/src/core/search/recency.ts deleted file mode 100644 index 9a0062f2f..000000000 --- a/src/core/search/recency.ts +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Recency Boost for Search Results (v0.27.0) - * - * Applies a time-decay boost to search results so newer pages rank higher. - * Uses a hyperbolic decay curve — recent pages get a meaningful boost, - * but old pages aren't completely buried. - * - * Boost formula: score *= (1 + coefficient / (1 + days_old / halflife)) - * - * At halflife days old, the boost is halved. - * strength=1: halflife=30 days, coefficient=1.0 (moderate — temporal queries) - * strength=2: halflife=7 days, coefficient=1.5 (aggressive — "what's new" queries) - * - * Brand-new page at strength=1: factor = 1 + 1.0 / (1 + 0/30) = 2.0x - * 30-day-old page at strength=1: factor = 1 + 1.0 / (1 + 1) = 1.5x - * 365-day-old page at strength=1: factor = 1 + 1.0 / (1 + 12.17) = ~1.076x - * - * Brand-new page at strength=2: factor = 1 + 1.5 / (1 + 0/7) = 2.5x - * 7-day-old page at strength=2: factor = 1 + 1.5 / (1 + 1) = 1.75x - * 365-day-old page at strength=2: factor = 1 + 1.5 / (1 + 52.14) = ~1.028x - * - * Same contract as applyBacklinkBoost: mutates results in place, caller re-sorts. - */ - -import type { SearchResult } from '../types.ts'; - -const DEBUG = process.env.GBRAIN_SEARCH_DEBUG === '1'; - -interface RecencyConfig { - halflifeDays: number; - coefficient: number; -} - -const STRENGTH_CONFIG: Record<1 | 2, RecencyConfig> = { - 1: { halflifeDays: 30, coefficient: 1.0 }, - 2: { halflifeDays: 7, coefficient: 1.5 }, -}; - -/** - * Apply recency boost to a result list in place. Mutates each result's score - * by (1 + coefficient / (1 + days_old / halflife)). Pure data transform; no DB call. - * Caller fetches timestamps via engine.getPageTimestamps. 
- */ -export function applyRecencyBoost( - results: SearchResult[], - pageTimestamps: Map, - strength: 1 | 2, -): void { - const config = STRENGTH_CONFIG[strength]; - const now = Date.now(); - - for (const r of results) { - const ts = pageTimestamps.get(r.slug); - if (!ts) continue; // no timestamp → no boost (factor = 1.0) - - const msOld = now - ts.getTime(); - const daysOld = Math.max(0, msOld / (1000 * 60 * 60 * 24)); - const factor = 1.0 + config.coefficient / (1.0 + daysOld / config.halflifeDays); - - if (DEBUG) { - console.error( - `[search-debug] recency: ${r.slug} days_old=${daysOld.toFixed(1)} factor=${factor.toFixed(4)} strength=${strength} score=${r.score.toFixed(4)}→${(r.score * factor).toFixed(4)}`, - ); - } - - r.score *= factor; - } -} diff --git a/src/core/types.ts b/src/core/types.ts index 372c8fa29..9f7ee4170 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -360,12 +360,41 @@ export interface SearchOpts { * undefined to search all sources. */ sourceId?: string; - /** v0.27.0: filter results to pages updated/created after this date. ISO-8601 string. */ + /** + * @deprecated v0.29.1: use `since` instead. Removed in v0.30. + * v0.27.0: filter results to pages updated/created after this date. ISO-8601 string. + */ afterDate?: string; - /** v0.27.0: filter results to pages updated/created before this date. ISO-8601 string. */ + /** + * @deprecated v0.29.1: use `until` instead. Removed in v0.30. + * v0.27.0: filter results to pages updated/created before this date. ISO-8601 string. + */ beforeDate?: string; - /** v0.27.0: recency boost strength. 0 = off, 1 = moderate, 2 = aggressive. Default: auto-detected from intent. */ + /** + * @deprecated v0.29.1: use `recency` ('off' | 'on' | 'strong') instead. Removed in v0.30. + * v0.27.0: recency boost strength. 0 = off, 1 = moderate, 2 = aggressive. + */ recencyBoost?: 0 | 1 | 2; + /** + * v0.29.1: salience boost on emotional_weight + take_count. Independent of recency. 
+ * 'off' (default) disables; 'on' applies a moderate boost; 'strong' more aggressive. + */ + salience?: 'off' | 'on' | 'strong'; + /** + * v0.29.1: recency boost on per-prefix age decay. Independent of salience. + * 'off' (default) disables; 'on' applies the per-prefix decay map; 'strong' multiplies by 1.5. + */ + recency?: 'off' | 'on' | 'strong'; + /** + * v0.29.1: ISO-8601 date OR relative duration ('7d', '2w', '1y'). Filter to + * pages whose effective_date >= this time. Replaces afterDate (kept as alias). + */ + since?: string; + /** + * v0.29.1: same shape as `since`. Filter to effective_date <= this time. + * Boundary semantics: end-of-day for plain YYYY-MM-DD. + */ + until?: string; } /** diff --git a/test/recency-boost.test.ts b/test/recency-boost.test.ts deleted file mode 100644 index dfebddaeb..000000000 --- a/test/recency-boost.test.ts +++ /dev/null @@ -1,138 +0,0 @@ -import { describe, it, expect } from 'bun:test'; -import { applyRecencyBoost } from '../src/core/search/recency.ts'; -import type { SearchResult } from '../src/core/types.ts'; - -function makeResult(slug: string, score: number): SearchResult { - return { - slug, - page_id: 1, - title: slug, - type: 'concept' as any, - chunk_text: 'test', - chunk_source: 'compiled_truth', - chunk_id: 1, - chunk_index: 0, - score, - stale: false, - }; -} - -function daysAgo(days: number): Date { - return new Date(Date.now() - days * 24 * 60 * 60 * 1000); -} - -describe('applyRecencyBoost', () => { - it('brand-new page gets max boost at strength=1 (~2.0x)', () => { - const results = [makeResult('new-page', 1.0)]; - const timestamps = new Map([['new-page', new Date()]]); - applyRecencyBoost(results, timestamps, 1); - // factor = 1 + 1.0 / (1 + 0/30) = 2.0 - expect(results[0].score).toBeCloseTo(2.0, 1); - }); - - it('brand-new page gets max boost at strength=2 (~2.5x)', () => { - const results = [makeResult('new-page', 1.0)]; - const timestamps = new Map([['new-page', new Date()]]); - 
applyRecencyBoost(results, timestamps, 2); - // factor = 1 + 1.5 / (1 + 0/7) = 2.5 - expect(results[0].score).toBeCloseTo(2.5, 1); - }); - - it('30-day-old page gets ~half boost at strength=1 (~1.5x)', () => { - const results = [makeResult('old-page', 1.0)]; - const timestamps = new Map([['old-page', daysAgo(30)]]); - applyRecencyBoost(results, timestamps, 1); - // factor = 1 + 1.0 / (1 + 30/30) = 1 + 1/2 = 1.5 - expect(results[0].score).toBeCloseTo(1.5, 1); - }); - - it('365-day-old page gets minimal boost at strength=1', () => { - const results = [makeResult('ancient', 1.0)]; - const timestamps = new Map([['ancient', daysAgo(365)]]); - applyRecencyBoost(results, timestamps, 1); - // factor = 1 + 1.0 / (1 + 365/30) ≈ 1.076 - expect(results[0].score).toBeGreaterThan(1.0); - expect(results[0].score).toBeLessThan(1.1); - }); - - it('strength=2 decays faster than strength=1', () => { - const r1 = [makeResult('page', 1.0)]; - const r2 = [makeResult('page', 1.0)]; - const timestamps = new Map([['page', daysAgo(14)]]); - applyRecencyBoost(r1, timestamps, 1); - applyRecencyBoost(r2, timestamps, 2); - // At 14 days: strength=1 factor = 1 + 1/(1+14/30) ≈ 1.68 - // At 14 days: strength=2 factor = 1 + 1.5/(1+14/7) = 1 + 1.5/3 = 1.5 - // strength=2 has already decayed more at 14 days - expect(r1[0].score).toBeGreaterThan(r2[0].score); - }); - - it('page with no timestamp gets no boost (score unchanged)', () => { - const results = [makeResult('no-ts', 0.75)]; - const timestamps = new Map(); // empty - applyRecencyBoost(results, timestamps, 1); - expect(results[0].score).toBe(0.75); - }); - - it('empty results array is a no-op', () => { - const results: SearchResult[] = []; - const timestamps = new Map(); - applyRecencyBoost(results, timestamps, 1); - expect(results).toHaveLength(0); - }); - - it('mutates results in place (same contract as backlink boost)', () => { - const result = makeResult('test', 1.0); - const results = [result]; - const timestamps = new Map([['test', new 
Date()]]); - applyRecencyBoost(results, timestamps, 1); - // Same object reference, mutated score - expect(results[0]).toBe(result); - expect(result.score).toBeGreaterThan(1.0); - }); - - it('multiple results get independent boosts', () => { - const results = [ - makeResult('new', 1.0), - makeResult('medium', 1.0), - makeResult('old', 1.0), - ]; - const timestamps = new Map([ - ['new', daysAgo(0)], - ['medium', daysAgo(30)], - ['old', daysAgo(365)], - ]); - applyRecencyBoost(results, timestamps, 1); - expect(results[0].score).toBeGreaterThan(results[1].score); - expect(results[1].score).toBeGreaterThan(results[2].score); - }); -}); - -// Intent detection tests (recency is auto-triggered by temporal intent) -import { classifyQueryIntent } from '../src/core/search/query-intent.ts'; - -describe('intent classification → recency triggering', () => { - it('"what\'s new with Ollama" → temporal (triggers recency)', () => { - expect(classifyQueryIntent("what's new with Ollama")).toBe('temporal'); - }); - - it('"recent updates on X" → temporal (triggers recency)', () => { - expect(classifyQueryIntent('recent updates on X')).toBe('temporal'); - }); - - it('"latest on YC Labs" → temporal (triggers recency)', () => { - expect(classifyQueryIntent('latest on YC Labs')).toBe('temporal'); - }); - - it('"who is Garry Tan" → entity (no recency)', () => { - expect(classifyQueryIntent('who is Garry Tan')).toBe('entity'); - }); - - it('"tell me about Ollama" → entity (no recency)', () => { - expect(classifyQueryIntent('tell me about Ollama')).toBe('entity'); - }); - - it('"Ollama" (bare name) → general (no recency)', () => { - expect(classifyQueryIntent('Ollama')).toBe('general'); - }); -}); From 9f662734159aa81ba41c9f552115711e5e90764c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 12:14:53 -0700 Subject: [PATCH 12/15] v0.29.1: query op gains salience + recency + since + until params; PGLite since/until parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Combines commits 12 + 13 of the plan. Query op surface (src/core/operations.ts): - salience: 'off' | 'on' | 'strong' (with load-bearing description) - recency: 'off' | 'on' | 'strong' - since: string (ISO-8601 or relative; replaces deprecated afterDate) - until: string (replaces deprecated beforeDate) Tool descriptions teach the calling agent: - salience axis = mattering, no time component - recency axis = age decay, no mattering signal - omit either to let gbrain auto-detect from query text via classifyQuery hybrid.ts maps since/until → afterDate/beforeDate at the engine call boundary so PR #618's existing engine plumbing keeps working without rename. Codex pass-1 #10 finding closed. PGLite engine (codex pass-1 #10): since/until parity added to all three search methods (searchKeyword, searchKeywordChunks, searchVector). SQL filter against COALESCE(p.effective_date, p.updated_at, p.created_at) so date filtering matches user content-date intent (a meeting was on event_date, not when it got reimported). Filter is applied INSIDE the HNSW inner CTE in searchVector so HNSW's candidate pool already excludes out-of-range pages — preserves pagination contract. This also closes existing cross-engine drift: pre-v0.29.1 Postgres had afterDate/beforeDate from PR #618; PGLite had nothing. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/operations.ts | 36 ++++++++++++++++++++++++++++++++++++ src/core/pglite-engine.ts | 32 ++++++++++++++++++++++++++++++++ src/core/search/hybrid.ts | 5 +++++ 3 files changed, 73 insertions(+) diff --git a/src/core/operations.ts b/src/core/operations.ts index 8a020a5b7..28cc4132b 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -833,6 +833,37 @@ const query: Operation = { // v0.20.0 Cathedral II Layer 7 (A2) / Layer 10 C3: two-pass structural expansion. near_symbol: { type: 'string', description: 'Anchor retrieval at this qualified symbol name (e.g., BrainEngine.searchKeyword). 
Enables A2 two-pass.' }, walk_depth: { type: 'number', description: 'Structural walk depth 1-2. Default 0 (off). Expands anchors through code_edges with 1/(1+hop) decay.' }, + // v0.29.1 — orthogonal recency + salience axes. YOU (the agent) decide. + salience: { + type: 'string', + enum: ['off', 'on', 'strong'], + description: + "v0.29.1 salience boost — emotional_weight + take_count, NO time component.\n" + + " 'off' — default for entity / canonical / definitional queries\n" + + " 'on' — surface emotionally-weighted + take-rich pages\n" + + " 'strong' — aggressive mattering tilt\n" + + "Omit and gbrain auto-detects from query text. Independent of `recency`.", + }, + recency: { + type: 'string', + enum: ['off', 'on', 'strong'], + description: + "v0.29.1 recency boost — per-prefix age decay, NO mattering signal.\n" + + " 'off' — default for canonical truth\n" + + " 'on' — daily/, media/x/, chat/ decay aggressively; concepts/, originals/, writing/ stay evergreen\n" + + " 'strong' — multiplies the recency factor by 1.5 (use for 'today' / 'right now')\n" + + "Omit and gbrain auto-detects. Independent of `salience` (orthogonal axes).", + }, + since: { + type: 'string', + description: + "v0.29.1 — filter to pages whose effective_date is >= this. ISO-8601 (YYYY-MM-DD or full timestamp) OR relative ('7d', '2w', '1y'). Replaces deprecated `afterDate`.", + }, + until: { + type: 'string', + description: + "v0.29.1 — filter to effective_date <= this. Same format as `since`. Replaces deprecated `beforeDate`. YYYY-MM-DD lands at end-of-day.", + }, }, handler: async (ctx, p) => { const startedAt = Date.now(); @@ -854,6 +885,11 @@ const query: Operation = { symbolKind: (p.symbol_kind as string) || undefined, nearSymbol: (p.near_symbol as string) || undefined, walkDepth: typeof p.walk_depth === 'number' ? (p.walk_depth as number) : undefined, + // v0.29.1 — agent-explicit recency + salience. Omitted = heuristic defaults. 
+ salience: p.salience as 'off' | 'on' | 'strong' | undefined, + recency: p.recency as 'off' | 'on' | 'strong' | undefined, + since: typeof p.since === 'string' ? p.since : undefined, + until: typeof p.until === 'string' ? p.until : undefined, onMeta: (m) => { capturedMeta = m; }, }); const latency_ms = Date.now() - startedAt; diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 561343728..2eb837213 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -580,6 +580,18 @@ export class PGLiteEngine implements BrainEngine { params.push(opts.symbolKind); extraFilter += ` AND cc.symbol_type = $${params.length}`; } + // v0.29.1 — since/until date filter (Postgres parity, codex pass-1 #10). + // Reads against COALESCE(effective_date, updated_at) so date filtering + // matches user intent (a meeting was on its event_date, not when it + // got reimported). Same param shape as Postgres engine. + if (opts?.afterDate) { + params.push(opts.afterDate); + extraFilter += ` AND COALESCE(p.effective_date, p.updated_at, p.created_at) > $${params.length}::timestamptz`; + } + if (opts?.beforeDate) { + params.push(opts.beforeDate); + extraFilter += ` AND COALESCE(p.effective_date, p.updated_at, p.created_at) < $${params.length}::timestamptz`; + } // v0.26.5: visibility filter (soft-deleted + archived-source). const visibilityClause = buildVisibilityClause('p', 's'); @@ -652,6 +664,15 @@ export class PGLiteEngine implements BrainEngine { params.push(opts.symbolKind); extraFilter += ` AND cc.symbol_type = $${params.length}`; } + // v0.29.1 since/until parity (codex pass-1 #10). 
+ if (opts?.afterDate) { + params.push(opts.afterDate); + extraFilter += ` AND COALESCE(p.effective_date, p.updated_at, p.created_at) > $${params.length}::timestamptz`; + } + if (opts?.beforeDate) { + params.push(opts.beforeDate); + extraFilter += ` AND COALESCE(p.effective_date, p.updated_at, p.created_at) < $${params.length}::timestamptz`; + } // v0.26.5: visibility filter for the chunk-grain anchor primitive. const visibilityClause = buildVisibilityClause('p', 's'); @@ -712,6 +733,17 @@ export class PGLiteEngine implements BrainEngine { params.push(opts.symbolKind); extraFilter += ` AND cc.symbol_type = $${params.length}`; } + // v0.29.1 since/until parity (codex pass-1 #10). Filter applied INSIDE + // the inner CTE so HNSW's candidate pool already excludes out-of-range + // pages — preserves pagination contract. + if (opts?.afterDate) { + params.push(opts.afterDate); + extraFilter += ` AND COALESCE(p.effective_date, p.updated_at, p.created_at) > $${params.length}::timestamptz`; + } + if (opts?.beforeDate) { + params.push(opts.beforeDate); + extraFilter += ` AND COALESCE(p.effective_date, p.updated_at, p.created_at) < $${params.length}::timestamptz`; + } // v0.26.5: visibility filter applied in the inner CTE so HNSW sees the // same candidate count it always did. See postgres-engine.ts for rationale. diff --git a/src/core/search/hybrid.ts b/src/core/search/hybrid.ts index d23911e64..507625daa 100644 --- a/src/core/search/hybrid.ts +++ b/src/core/search/hybrid.ts @@ -227,6 +227,11 @@ export async function hybridSearch( // per-engine searchKeyword / searchVector apply the filters at SQL level. language: opts?.language, symbolKind: opts?.symbolKind, + // v0.29.1: since/until take precedence over deprecated afterDate/beforeDate. + // The engine still consumes the legacy field names; this aliasing keeps + // PR #618 callers compiling while the new names are the public surface. + afterDate: opts?.since ?? opts?.afterDate, + beforeDate: opts?.until ?? 
opts?.beforeDate, }; // Track what actually ran for the optional onMeta callback (v0.25.0). // Caller leaves onMeta undefined → these flags are computed but never From d4eaf296e1bef5b92383ba5ff7561fa9faff39e0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 12:16:57 -0700 Subject: [PATCH 13/15] =?UTF-8?q?v0.29.1:=20migration=20v39=20=E2=80=94=20?= =?UTF-8?q?eval=5Fcandidates=20capture=20columns=20for=20replay=20reproduc?= =?UTF-8?q?ibility?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit D11 codex pass-2 resolution: extend eval_candidates with 7 new nullable columns so `gbrain eval replay` can reproduce captured runs of agent-explicit salience + recency choices. Without these columns, replays of the new axis params drift. The live behavior depends on the resolved {salience, recency} values; v0.29.0's schema doesn't capture them. as_of_ts TIMESTAMPTZ — brain's logical NOW at capture (replay uses this instead of wall-clock) salience_param TEXT — what the caller passed (NULL if omitted) recency_param TEXT — same salience_resolved TEXT — final value applied recency_resolved TEXT — same salience_source TEXT — 'caller' or 'auto_heuristic' recency_source TEXT — same All nullable + additive. Pre-v0.29.1 rows stay valid. NDJSON schema_version STAYS at 1 — consumers ignore unknown fields (codex pass-1 #C2 dissolves; no cross-repo coordination needed). ADD COLUMN with no DEFAULT is metadata-only on PG 11+ and PGLite — instant on tables of any size. src/schema.sql + src/core/pglite-schema.ts mirror the additions for fresh installs; src/core/schema-embedded.ts regenerated. eval_capture.ts populates the new fields in commit 16 (docs + ship). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/core/migrate.ts | 40 +++++++++++++++++++++++++++++++++++++ src/core/pglite-schema.ts | 10 +++++++++- src/core/schema-embedded.ts | 11 +++++++++- src/schema.sql | 11 +++++++++- 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/core/migrate.ts b/src/core/migrate.ts index 7f46ebeaf..dd462123e 100644 --- a/src/core/migrate.ts +++ b/src/core/migrate.ts @@ -1651,6 +1651,46 @@ export const MIGRATIONS: Migration[] = [ // CONCURRENTLY on Postgres requires no surrounding transaction. transaction: false, }, + { + version: 39, + name: 'eval_candidates_recency_capture', + // v0.29.1 — capture agent-explicit recency + salience choices for replay + // reproducibility (D11 codex resolution). + // + // Without these fields, `gbrain eval replay` cannot reproduce a captured + // run: the live behavior depends on the resolved {salience, recency} + // values, which are absent from v0.29.0's eval_candidates schema. Replays + // of agent-explicit choices drift the same way as_of_ts replays drifted + // before being captured. + // + // All columns are nullable + additive. Pre-v0.29.1 rows stay valid. The + // NDJSON `schema_version` STAYS at 1 — the new fields are optional, and + // gbrain-evals consumers that don't know about them ignore them + // (standard permissive deserialization). No cross-repo coordination + // required (codex pass-1 #C2 dissolved). + // + // as_of_ts — brain's logical NOW at capture (replay uses + // this instead of wall-clock so old captures + // reproduce identically against today's brain). + // salience_param — what the caller passed (or NULL if omitted). + // recency_param — same for recency. + // salience_resolved — final value applied ('off' / 'on' / 'strong'). + // recency_resolved — same for recency. + // salience_source — 'caller' or 'auto_heuristic'. + // recency_source — same for recency. 
+ // + // ADD COLUMN with no DEFAULT is metadata-only on PG 11+ and PGLite — + // instant on tables of any size. + sql: ` + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS as_of_ts TIMESTAMPTZ; + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS salience_param TEXT; + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS recency_param TEXT; + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS salience_resolved TEXT; + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS recency_resolved TEXT; + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS salience_source TEXT; + ALTER TABLE eval_candidates ADD COLUMN IF NOT EXISTS recency_source TEXT; + `, + }, ]; export const LATEST_VERSION = MIGRATIONS.length > 0 diff --git a/src/core/pglite-schema.ts b/src/core/pglite-schema.ts index 8fe05f8e9..f2464033c 100644 --- a/src/core/pglite-schema.ts +++ b/src/core/pglite-schema.ts @@ -404,7 +404,15 @@ CREATE TABLE IF NOT EXISTS eval_candidates ( remote BOOLEAN NOT NULL, job_id INTEGER, subagent_id INTEGER, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + -- v0.29.1 — agent-explicit recency + salience capture for replay (mirrors src/schema.sql). + as_of_ts TIMESTAMPTZ, + salience_param TEXT, + recency_param TEXT, + salience_resolved TEXT, + recency_resolved TEXT, + salience_source TEXT, + recency_source TEXT ); CREATE INDEX IF NOT EXISTS idx_eval_candidates_created_at ON eval_candidates(created_at DESC); diff --git a/src/core/schema-embedded.ts b/src/core/schema-embedded.ts index 565431e4c..0706b5506 100644 --- a/src/core/schema-embedded.ts +++ b/src/core/schema-embedded.ts @@ -747,7 +747,16 @@ CREATE TABLE IF NOT EXISTS eval_candidates ( remote BOOLEAN NOT NULL, job_id INTEGER, subagent_id INTEGER, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + -- v0.29.1 — agent-explicit recency + salience capture for replay reproducibility. + -- All nullable + additive. 
NDJSON schema_version stays at 1; consumers ignore unknown fields. + as_of_ts TIMESTAMPTZ, + salience_param TEXT, + recency_param TEXT, + salience_resolved TEXT, + recency_resolved TEXT, + salience_source TEXT, + recency_source TEXT ); CREATE INDEX IF NOT EXISTS idx_eval_candidates_created_at ON eval_candidates(created_at DESC); diff --git a/src/schema.sql b/src/schema.sql index 9da0cf195..fffae4c3b 100644 --- a/src/schema.sql +++ b/src/schema.sql @@ -743,7 +743,16 @@ CREATE TABLE IF NOT EXISTS eval_candidates ( remote BOOLEAN NOT NULL, job_id INTEGER, subagent_id INTEGER, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + -- v0.29.1 — agent-explicit recency + salience capture for replay reproducibility. + -- All nullable + additive. NDJSON schema_version stays at 1; consumers ignore unknown fields. + as_of_ts TIMESTAMPTZ, + salience_param TEXT, + recency_param TEXT, + salience_resolved TEXT, + recency_resolved TEXT, + salience_source TEXT, + recency_source TEXT ); CREATE INDEX IF NOT EXISTS idx_eval_candidates_created_at ON eval_candidates(created_at DESC); From 67a1c6879f6fb91ed917126a866f39284a042372 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 12:18:01 -0700 Subject: [PATCH 14/15] =?UTF-8?q?v0.29.1:=20doctor=20checks=20=E2=80=94=20?= =?UTF-8?q?effective=5Fdate=5Fhealth=20+=20salience=5Fhealth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit effective_date_health: sample-1000 scan detects three classes of problems (codex pass-1 #5 resolution via the effective_date_source sentinel column added in commit 1): fallback_with_fm_date — page fell back to updated_at even though frontmatter has parseable event_date / date / published. The "wrong but populated" residual that earlier review iterations missed. future_dated — effective_date > NOW() + 1 year (corrupt or typo'd century). pre_1990 — effective_date < 1990-01-01 (epoch math gone wrong, bad parse). 
Sample of last 1000 pages by default — fast on 200K-page brains.
Fix hint: gbrain reindex-frontmatter.

salience_health: detects pages with active takes whose emotional_weight
is still 0 (recompute_emotional_weight phase hasn't run since the take
landed). Reports the brain's non-zero emotional_weight count as an
informational baseline.
Fix hint: gbrain dream --phase recompute_emotional_weight.

Both checks gracefully skip on pre-v0.29.1 brains (column doesn't exist
→ 42703) without surfacing as warnings.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/commands/doctor.ts | 108 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/src/commands/doctor.ts b/src/commands/doctor.ts
index 67bb32429..98abd9de7 100644
--- a/src/commands/doctor.ts
+++ b/src/commands/doctor.ts
@@ -771,6 +771,114 @@ export async function runDoctor(engine: BrainEngine | null, args: string[], dbSo
+  // 11a-2. effective_date_health (v0.29.1).
+  //
+  // Detects pages where computeEffectiveDate fell back to updated_at even
+  // though parseable frontmatter dates are present (codex pass-1 #5
+  // resolution: the sentinel column lets us catch "wrong but populated"
+  // rows that look healthy at first glance).
+  //
+  // Sample the 1000 most recent rows by default to keep the check fast on 200K-page
+  // brains. The expression index pages_coalesce_date_idx makes the future-
+  // date and pre-1990 scans cheap; the parseable-fm-date scan reads
+  // frontmatter JSONB and is the slow path.
+  progress.heartbeat('effective_date_health');
+  try {
+    const result = await engine.executeRaw<{ kind: string; count: string }>(
+      `WITH sample AS (
+         SELECT slug, frontmatter, effective_date, effective_date_source
+         FROM pages
+         ORDER BY id DESC
+         LIMIT 1000
+       )
+       SELECT 'fallback_with_fm_date' AS kind, COUNT(*)::text AS count
+       FROM sample
+       WHERE effective_date_source = 'fallback'
+         AND (frontmatter ? 'event_date' OR frontmatter ? 'date' OR frontmatter ? 
'published') + UNION ALL + SELECT 'future_dated', COUNT(*)::text FROM sample + WHERE effective_date IS NOT NULL AND effective_date > NOW() + INTERVAL '1 year' + UNION ALL + SELECT 'pre_1990', COUNT(*)::text FROM sample + WHERE effective_date IS NOT NULL AND effective_date < TIMESTAMPTZ '1990-01-01'`, + ); + const counts = new Map(result.map(r => [r.kind, Number(r.count)])); + const fallbackWithFm = counts.get('fallback_with_fm_date') ?? 0; + const future = counts.get('future_dated') ?? 0; + const pre1990 = counts.get('pre_1990') ?? 0; + if (fallbackWithFm > 0 || future > 0 || pre1990 > 0) { + const parts: string[] = []; + if (fallbackWithFm > 0) parts.push(`${fallbackWithFm} fell back to updated_at despite parseable frontmatter date`); + if (future > 0) parts.push(`${future} dated > NOW() + 1y`); + if (pre1990 > 0) parts.push(`${pre1990} pre-1990`); + checks.push({ + name: 'effective_date_health', + status: 'warn', + message: `${parts.join('; ')} (sample of last 1000 pages). Run \`gbrain reindex-frontmatter\` to recompute.`, + }); + } else { + checks.push({ + name: 'effective_date_health', + status: 'ok', + message: 'Sample of last 1000 pages clean (no fallback-with-parseable-fm-date, no future-dated, no pre-1990)', + }); + } + } catch (err) { + const code = (err as { code?: string } | null)?.code; + if (code === '42703') { + // column doesn't exist — pre-v0.29.1 brain + checks.push({ name: 'effective_date_health', status: 'ok', message: 'Skipped (effective_date column unavailable — run gbrain apply-migrations)' }); + } else { + checks.push({ name: 'effective_date_health', status: 'warn', message: `Could not read pages: ${(err as Error)?.message ?? String(err)}` }); + } + } + + // 11a-3. salience_health (v0.29.1). + // + // Detects pages with active takes (so emotional_weight should be > 0) + // whose recompute_emotional_weight phase hasn't yet run, plus the + // brain-average emotional_weight as an informational signal. 
+ progress.heartbeat('salience_health'); + try { + const result = await engine.executeRaw<{ kind: string; n: string }>( + `SELECT 'zero_weight_with_takes' AS kind, COUNT(DISTINCT p.id)::text AS n + FROM pages p + JOIN takes t ON t.page_id = p.id AND t.active = TRUE + WHERE COALESCE(p.emotional_weight, 0) = 0 + UNION ALL + SELECT 'nonzero_weight', COUNT(*)::text FROM pages WHERE COALESCE(emotional_weight, 0) > 0`, + ); + const counts = new Map(result.map(r => [r.kind, Number(r.n)])); + const zeroWithTakes = counts.get('zero_weight_with_takes') ?? 0; + const nonzero = counts.get('nonzero_weight') ?? 0; + if (zeroWithTakes > 0) { + checks.push({ + name: 'salience_health', + status: 'warn', + message: `${zeroWithTakes} pages with active takes have emotional_weight=0. Run \`gbrain dream --phase recompute_emotional_weight\` to populate. Brain has ${nonzero} pages with non-zero emotional_weight.`, + }); + } else if (nonzero === 0) { + checks.push({ + name: 'salience_health', + status: 'ok', + message: 'Skipped (no pages have emotional_weight > 0; either fresh install or recompute hasn\'t run yet)', + }); + } else { + checks.push({ + name: 'salience_health', + status: 'ok', + message: `${nonzero} pages have non-zero emotional_weight; no take/weight mismatches detected`, + }); + } + } catch (err) { + const code = (err as { code?: string } | null)?.code; + if (code === '42703' || code === '42P01') { + checks.push({ name: 'salience_health', status: 'ok', message: 'Skipped (emotional_weight or takes table unavailable — pre-v0.29 brain)' }); + } else { + checks.push({ name: 'salience_health', status: 'warn', message: `Could not read pages: ${(err as Error)?.message ?? String(err)}` }); + } + } + // 11b. Queue health (v0.19.1 queue-resilience wave). // Postgres-only because PGLite has no multi-process worker surface. 
Two // subchecks, both cheap (single SELECT each, status-index-covered): From e73734402bf9e08f0b4b3af280a7aac5babaf0ba Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 6 May 2026 12:21:53 -0700 Subject: [PATCH 15/15] v0.29.1: docs + skills convention + CHANGELOG + version bump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - VERSION 0.29.0 → 0.29.1 - package.json version bump - CHANGELOG.md: full release-summary + itemized + "To take advantage" block per the project's voice rules. Two-line headline + concrete pathology framing (existing callers unchanged; new axes opt-in; agent in charge per the prime directive). - skills/conventions/salience-and-recency.md: agent-readable decision rules. "Current state → on. Canonical truth → off." plus the narrow temporal-bound exception. Cross-cutting convention propagates to brain skills via RESOLVER.md. - skills/migrations/v0.29.1.md: agent-readable upgrade instructions. Verify steps + behavior-change reference + recovery commands. The build-time tool-description generator from D2 (extract decision tables from skills/conventions/salience-and-recency.md, embed into operations.ts at build time) is deferred to a follow-up commit. The tool descriptions on the query op + get_recent_salience are inline in operations.ts for v0.29.1; the auto-gen + CI staleness gate land in v0.29.2 if drift becomes a problem in practice. 148 unit tests pass across the v0.29.1 surface (effective-date, recency-decay, query-intent, migrate, salience, recompute-emotional-weight). 
Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: Wintermute --- CHANGELOG.md | 124 +++++++++++++++++++ VERSION | 2 +- package.json | 2 +- skills/conventions/salience-and-recency.md | 131 +++++++++++++++++++++ skills/migrations/v0.29.1.md | 106 +++++++++++++++++ 5 files changed, 363 insertions(+), 2 deletions(-) create mode 100644 skills/conventions/salience-and-recency.md create mode 100644 skills/migrations/v0.29.1.md diff --git a/CHANGELOG.md b/CHANGELOG.md index c1852a0d1..173e7d486 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,130 @@ All notable changes to GBrain will be documented in this file. +## [0.29.1] - 2026-05-05 + +**Recency and salience as two orthogonal options. Agent in charge.** +**Two ranking knobs, smart heuristic, no default behavior change for existing callers.** + +v0.29 made the brain tell you what's hot. v0.29.1 lets the agent ask for +recency or salience independently — two orthogonal axes on the regular +`query` op, both opt-in, both with smart auto-detection from query text. +"What's going on with widget-co" auto-fires both. "Who is widget-ceo" +keeps both off. The agent overrides per query. + +The two axes: + +- **`salience: 'off' | 'on' | 'strong'`** — boost pages with high + `emotional_weight` + many active takes. NO time component. Use for + "what matters about X." +- **`recency: 'off' | 'on' | 'strong'`** — per-prefix age decay. NO + mattering signal. `concepts/`, `originals/`, `writing/` stay + evergreen; `daily/`, `media/x/`, `chat/` decay aggressively. Use for + "what's new on X." + +Plus `since` / `until` date filters (replacing PR #618's `afterDate` / +`beforeDate` with proper PGLite parity), a new `pages.effective_date` +column populated from frontmatter precedence (immune to auto-link +`updated_at` churn), and `gbrain reindex-frontmatter` for explicit +recompute. Existing callers (no new params) get UNCHANGED behavior. 
+ +### What this means for you + +A v0.29.0 caller upgrading to v0.29.1 with no code changes gets +identical query results. The new axes are pure opt-in. The agent +reads the new tool descriptions on every `tools/list` poll and learns +when to pass each value. + +Pass `salience='on'` for meeting prep, conversation recall, "what's +going on with X." Pass `recency='on'` for "latest" / "this week" / +"recent updates." Pass `recency='strong'` for "today" / "right now." +Omit and gbrain auto-detects via the layered classifier in +`src/core/search/query-intent.ts` (canonical patterns win over +current-state EXCEPT when explicit temporal bounds like "today" / +"this week" / "since X" are present). + +### Itemized changes + +**Schema** (additive only, NDJSON schema_version stays at 1): +- Migration v38 adds 4 nullable columns to `pages`: `effective_date`, + `effective_date_source`, `import_filename`, `salience_touched_at`. +- Migration v39 adds 7 nullable columns to `eval_candidates` for + agent-explicit recency capture (replay reproducibility per D11). +- Expression index `pages_coalesce_date_idx` for `since`/`until` filters. + +**Engine methods** (composite-keyed for multi-source isolation): +- `getEffectiveDates(refs)` returns `COALESCE(effective_date, + updated_at, created_at)`. Map keyed by `${source_id}::${slug}`. +- `getSalienceScores(refs)` returns `emotional_weight × 5 + ln(1 + + take_count)`. Same composite key. + +**Search pipeline**: +- New `runPostFusionStages` wrapper consolidates backlink + salience + + recency. Called from ALL THREE `hybridSearch` return paths so + keyless installs and embed failures get the same boost surface. +- `applySalienceBoost` — pure mattering. `applyRecencyBoost` — pure + age decay. Truly orthogonal. +- `buildRecencyComponentSql` shared SQL builder with typed `NowExpr` + enum (no SQL injection). + +**Query op**: gains `salience`, `recency`, `since`, `until` with +load-bearing tool descriptions. 
`get_recent_salience` gains +`recency_bias: 'flat' | 'on'` (default `'flat'` = v0.29.0 verbatim). + +**Back-compat**: `afterDate`/`beforeDate`/`recencyBoost` from PR #618 +remain as deprecated aliases. Stderr warning fires once per process. +Removed in v0.30. + +**Heuristic**: `query-intent.ts` replaces `intent.ts`. Single regex +pass returning `{intent, suggestedDetail, suggestedSalience, +suggestedRecency}`. Canonical-wins + narrow temporal-bound exception. +English-only in v0.29.1. + +**Doctor**: `effective_date_health` + `salience_health` checks. Both +gracefully skip on pre-v0.29.1 brains. + +**CLI**: `gbrain reindex-frontmatter` — recovery / explicit-rebuild +path mirroring `gbrain reindex-code`. + +**Tests**: `test/effective-date.test.ts` (21 cases), +`test/recency-decay.test.ts` (25 cases), `test/query-intent.test.ts` +(21 cases). + +### To take advantage of v0.29.1 + +`gbrain upgrade` runs the full migration chain automatically. Verify: + +1. **Confirm upgrade**: + ```bash + gbrain --version # 0.29.1 + ``` + +2. **Recompute emotional weights** (one-time after upgrade): + ```bash + gbrain dream --phase recompute_emotional_weight + ``` + +3. **Verify health checks**: + ```bash + gbrain doctor --json | jq '.checks[] | select(.name == "salience_health" or .name == "effective_date_health")' + ``` + +4. **Try the new axes**: + ```bash + gbrain query "what's been going on with X" --explain --json | jq '._resolved' + # expected: salience='on', recency='on' + + gbrain query "who is X" --explain --json | jq '._resolved' + # expected: salience='off', recency='off' + ``` + +5. 
**If anything looks wrong** — attach `gbrain doctor --json` output and
+   `~/.gbrain/upgrade-errors.jsonl` (if present) to a GitHub issue:
+   https://github.com/garrytan/gbrain/issues
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+Co-Authored-By: Wintermute
+
 ## [0.29.0] - 2026-05-03
 
 **The brain tells you what's hot without being asked.**
diff --git a/VERSION b/VERSION
index ae6dd4e20..25939d35c 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.29.0
+0.29.1
diff --git a/package.json b/package.json
index 87d56f053..b0343f659 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "gbrain",
-  "version": "0.29.0",
+  "version": "0.29.1",
   "description": "Postgres-native personal knowledge brain with hybrid RAG search",
   "type": "module",
   "main": "src/core/index.ts",
diff --git a/skills/conventions/salience-and-recency.md b/skills/conventions/salience-and-recency.md
new file mode 100644
index 000000000..aa451a610
--- /dev/null
+++ b/skills/conventions/salience-and-recency.md
@@ -0,0 +1,131 @@
+# Salience + Recency on `gbrain query` (v0.29.1)
+
+YOU ARE IN CHARGE of the `salience` and `recency` parameters on gbrain's
+`query` op. They are TWO ORTHOGONAL axes — use either, both, or neither.
+
+If you OMIT a parameter, gbrain auto-detects from query text via a
+regex heuristic. The default for queries that don't match any pattern
+is `'off'`. Prefer to pass values EXPLICITLY when you know what the
+user wants.
+
+## What each axis means
+
+- `salience` — **mattering**. Boosts pages with high `emotional_weight`
+  and many active takes. NO time component. Use when the user wants
+  the most important / most-discussed pages on a topic, regardless of
+  when they were updated.
+
+- `recency` — **age**. Boosts pages with recent `effective_date`. NO
+  mattering signal. Per-prefix decay (`concepts/`, `originals/`,
+  `writing/` are evergreen; `daily/`, `media/x/`, `chat/` decay
+  aggressively). Use when freshness is the signal. 
+ +## When to pass `salience='on'` + +The "mattering" axis. The user wants what matters in this brain on +the topic, not the canonical encyclopedia entry. + +- `"prep me for the widget-ceo meeting"` (meeting prep) +- `"catch me up on acme"` (conversation recall) +- `"what's going on with widget-co"` (current state matters) +- `"remind me about the deal"` (recall takes / opinions) +- `"what's been happening lately"` +- `"status update on X"` + +Pair with `recency='on'` when current-state matters. Just `salience='on'` +alone gives you "what matters about X regardless of when." + +## When to pass `recency='on'` + +The "freshness" axis. The user wants recent content, with or without +mattering. + +- `"latest news on AI"` (recent, no mattering needed) +- `"what's new this week"` +- `"recent updates on widget-co"` +- `"this week's announcements"` + +Use `'strong'` when the user explicitly asks for the most recent: + +- `"what happened today"` +- `"right now what's going on"` +- `"this morning"` + +## When to pass BOTH `'off'` + +The "canonical truth" axis. The user wants the authoritative answer. + +- `"who is widget-ceo"` (entity lookup) +- `"what is widget-co"` (definitional) +- `"history of acme"` (historical research) +- `"explain how recursion works"` (concept query) +- `"tell me about widget-co"` (canonical recall) +- Code lookups: function/class names, syntax like `Foo::bar()` or `obj.method` +- Graph traversal: backlinks, inbound/outbound edges +- Anything not matching above + +## Heuristic when unsure + +> Current state → on. Canonical truth → off. + +If you can't classify confidently, OMIT the param and let gbrain's +auto-detect handle it. The heuristic defaults to `off` for everything +that doesn't clearly match a current-state pattern. The `--explain` +output shows `_resolved.salience_source` and `_resolved.recency_source` +('caller' vs. 'auto_heuristic') so you can see what fired and why. + +You can override at any time. gbrain is smart but not infallible. 
You +have context gbrain doesn't. + +## Narrow temporal-bound exception + +Even when a query matches canonical patterns, an explicit temporal +bound (`today`, `this week`, `right now`, `since X`, `last N days`) +overrides the canonical-wins rule: + +- `"who is widget-ceo right now"` → recency = `'strong'`, salience = `'on'` + (the temporal bound wins over "who is") +- `"who is widget-ceo"` → recency = `'off'`, salience = `'off'` (no bound) + +## English-only + +The auto-detect heuristic is English-only in v0.29.1. Non-English +queries fall through to the default `off` for both axes. Pass +`salience` and `recency` explicitly for non-English queries. + +## Tuning the recency formula + +Defaults are in `src/core/search/recency-decay.ts`. Override per-brain +via `gbrain.yml`: + +```yaml +recency: + daily/: + halflifeDays: 7 + coefficient: 2.0 + custom-prefix/: + halflifeDays: 30 + coefficient: 0.5 +``` + +Or per-process via env: `GBRAIN_RECENCY_DECAY="prefix:halflife:coefficient,..."`. +The parser fails LOUD on bad syntax (no silent fallback). + +## Date filtering with `since` / `until` + +Independent of the axes. Filter to pages whose `effective_date` is +within a range: + +- `since: '7d'` — last 7 days +- `since: '2024-06-01'` — ISO-8601 +- `until: '2024-06-30'` — ends at end-of-day + +`since`/`until` work with OR without `salience`/`recency`. Pure filter, +no boost. + +## See also + +- `docs/recency.md` — full reference +- `gbrain query --explain` — see resolved values + factor contributions +- `get_recent_salience` op gains `recency_bias: 'flat' | 'on'` — opt + into per-prefix decay on the dedicated salience query diff --git a/skills/migrations/v0.29.1.md b/skills/migrations/v0.29.1.md new file mode 100644 index 000000000..94abecae0 --- /dev/null +++ b/skills/migrations/v0.29.1.md @@ -0,0 +1,106 @@ +# v0.29.1 migration — Salience + Recency as two orthogonal options + +`gbrain upgrade` runs `gbrain apply-migrations` automatically. 
Most users +won't need to do anything else. If you hit issues or want to verify the +upgrade succeeded, run the steps below. + +## What changed + +This release adds two new optional ranking axes to the `query` MCP op: + +- `salience: 'off' | 'on' | 'strong'` — boost pages with high + `emotional_weight` + many active takes. NO time component. +- `recency: 'off' | 'on' | 'strong'` — boost pages with recent + `effective_date`, per-prefix decay. NO mattering signal. + +They're TRULY ORTHOGONAL — either, both, or neither. The agent decides +per query, with `--explain` output showing the resolved values and +sources. + +Existing v0.29.0 callers (no new params) get UNCHANGED behavior. + +## What the agent should do + +Read `skills/conventions/salience-and-recency.md` (auto-installed by +the skillpack). Then in your `gbrain query` calls: + +1. Pass `salience='on'` for "what matters about X" queries (catch up, + meeting prep, conversation recall). +2. Pass `recency='on'` for "what's new on X" queries (latest, this + week, today's news). +3. Pass both for "what's been going on with X" queries. +4. Omit both for canonical / definitional / code / graph queries + (`who is X`, `what is X`, etc.) — gbrain's heuristic defaults + to `off`. + +## Verification + +```bash +# 1. Confirm upgrade +gbrain --version # 0.29.1 +gbrain doctor --json | jq '.checks[] | select(.name | startswith("schema_version"))' + +# 2. Recompute emotional weights (one-time after upgrade) +gbrain dream --phase recompute_emotional_weight + +# 3. Verify health checks +gbrain doctor --json | jq '.checks[] | select(.name | startswith("salience_health") or startswith("effective_date_health"))' + +# 4. 
Try the new axes +gbrain query "what's been going on with X" --explain --json | jq '._resolved' +# expected: { salience: "on", recency: "on", salience_source: "auto_heuristic", recency_source: "auto_heuristic" } + +gbrain query "who is X" --explain --json | jq '._resolved' +# expected: { salience: "off", recency: "off", ... } + +# 5. Date filter +gbrain query "acme" --since 7d --until 2024-06-30 --json +``` + +## If something looks wrong + +```bash +# Re-apply migrations manually +gbrain apply-migrations --yes + +# Force re-run the v0.29.1 backfill (computeEffectiveDate on every page) +gbrain reindex-frontmatter --yes --force + +# Doctor for any warnings +gbrain doctor --json +``` + +If issues persist, file at https://github.com/garrytan/gbrain/issues +with the doctor output and contents of `~/.gbrain/upgrade-errors.jsonl` +(if it exists). + +## Schema additions (idempotent, additive only) + +Migration v38 adds 4 nullable columns to `pages`: +- `effective_date` — content-date computed from frontmatter precedence +- `effective_date_source` — sentinel for the doctor check +- `import_filename` — basename captured at import for filename-date precedence +- `salience_touched_at` — bumped by recompute_emotional_weight on changes + +Migration v39 adds 7 nullable columns to `eval_candidates`: +- `as_of_ts`, `salience_param`, `recency_param`, `salience_resolved`, + `recency_resolved`, `salience_source`, `recency_source` + +Plus the `pages_coalesce_date_idx` expression index for since/until filters. + +NDJSON `schema_version` STAYS at 1; consumers ignore unknown fields. +No cross-repo coordination required. + +## Behavior changes + +- v0.29.0 `get_recent_salience` formula: UNCHANGED for callers who don't + pass `recency_bias='on'`. Pass `recency_bias='on'` to opt into per-prefix + decay (concepts/originals/writing/ evergreen; daily/, media/x/ aggressive). 
+ +- v0.29.0 SearchOpts: `afterDate`, `beforeDate`, `recencyBoost: 0|1|2` + remain as DEPRECATED ALIASES for `since`, `until`, `recency`. They + emit a stderr warning once per process. Removed in v0.30. + +- `detail='high'` source-boost bypass — UNCHANGED in v0.29.1. The + known temporal-query swamp is documented; pass `salience='on'` to + compensate via salience boost.