From 4efe907471a11d1dbc0fb39ae0cbe09cf33717bb Mon Sep 17 00:00:00 2001 From: Jake Browning Date: Thu, 21 May 2026 21:06:25 -0500 Subject: [PATCH] Enhance scientific RAG citations and retrieval --- .../utils/server/scientific-rag.test.ts | 111 +++++++++ ui/pages/api/fetch-documents.ts | 29 ++- ui/pages/api/inject-documents.ts | 53 ++-- ui/pages/api/rag-chat.ts | 29 ++- ui/utils/server/scientific-rag.ts | 230 ++++++++++++++++++ 5 files changed, 407 insertions(+), 45 deletions(-) create mode 100644 ui/__tests__/utils/server/scientific-rag.test.ts create mode 100644 ui/utils/server/scientific-rag.ts diff --git a/ui/__tests__/utils/server/scientific-rag.test.ts b/ui/__tests__/utils/server/scientific-rag.test.ts new file mode 100644 index 0000000..1fbdfbb --- /dev/null +++ b/ui/__tests__/utils/server/scientific-rag.test.ts @@ -0,0 +1,111 @@ +import { + buildScientificChunkMetadata, + createCitationKey, + detectScientificSection, + formatRetrievedScientificSources, + formatSourcesForPrompt, + slugifyCitationPart, +} from '@/utils/server/scientific-rag'; + +import { describe, expect, it } from 'vitest'; + +describe('scientific RAG helpers', () => { + it('detects common scientific sections from chunk text', () => { + expect(detectScientificSection('Abstract\nWe study retrieval.')).toBe('abstract'); + expect(detectScientificSection('Materials and Methods: cohort details')).toBe('methods'); + expect(detectScientificSection('Results\nAccuracy improved.')).toBe('results'); + expect(detectScientificSection('Unlabeled paragraph')).toBe('body'); + }); + + it('creates compact stable citation keys', () => { + expect(slugifyCitationPart('A Study: Retrieval & Citations!')).toBe( + 'a-study-retrieval-citations', + ); + expect(createCitationKey('A Study: Retrieval & Citations!', 3, 2)).toBe( + 'a-study-retrieval-citations:p3:c2', + ); + }); + + it('preserves upload and Semantic Scholar metadata for indexed chunks', () => { + const uploaded = buildScientificChunkMetadata( + { + pageContent: 'Discussion\nScientific context.', + metadata: { + source: '/tmp/paper.pdf', + pdf: { info: { Title: 'Paper Title' } }, + loc: { pageNumber: 8 }, + }, + }, + 0, + ); + + expect(uploaded).toMatchObject({ + title: 'Paper Title', + page: 8, + section: 'discussion', + sourceType: 'upload', + citationKey: 'paper-title:p8:c1', + }); + + const reference = buildScientificChunkMetadata( + { + pageContent: 'Abstract\nReference content.', + metadata: { + title: 'Scholar Paper', + semanticScholarId: 'S2-123', + semanticScholarUrl: 'https://semanticscholar.org/paper/S2-123', + doi: '10.1000/example', + page: 2, + }, + }, + 3, + ); + + expect(reference).toMatchObject({ + title: 'Scholar Paper', + sourceType: 'semantic_scholar', + semanticScholarId: 'S2-123', + doi: '10.1000/example', + citationKey: 'scholar-paper:p2:c4', + }); + }); + + it('formats Chroma results into citation-first prompt context', () => { + const sources = formatRetrievedScientificSources({ + documents: [['Claim one', 'Claim two']], + metadatas: [ + [ + { + title: 'Paper A', + page: 1, + chunk: 1, + section: 'methods', + citationKey: 'paper-a:p1:c1', + doi: '10.1000/a', + }, + { + title: 'Paper B', + page: 4, + chunk: 2, + section: 'results', + }, + ], + ], + distances: [[0.12, 0.34]], + }); + + expect(sources).toHaveLength(2); + expect(sources[0]).toMatchObject({ + citationKey: 'paper-a:p1:c1', + distance: 0.12, + doi: '10.1000/a', + }); + expect(sources[1].citationKey).toBe('paper-b:p4:c2'); + + const promptContext = formatSourcesForPrompt(sources); + + expect(promptContext).toContain('Source 1 [paper-a:p1:c1]'); + expect(promptContext).toContain('distance: 0.1200'); + expect(promptContext).toContain('Section: results'); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..78c6488 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,13 +1,26 @@ import type { NextApiRequest, NextApiResponse } from "next"; + +import { + formatRetrievedScientificSources, + formatSourcesForPrompt, +} from '@/utils/server/scientific-rag'; import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { + if (req.method !== 'POST') { + return res.status(405).end(); + } + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; + const query = String(req.body.input ?? '').trim(); + + if (!query) { + return res.status(400).json({ error: 'A retrieval query is required' }); + } const embedder = new TransformersEmbeddingFunction(); @@ -15,11 +28,17 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) // query the collection const results = await collection.query({ - nResults: 4, + nResults: Math.min(Number(req.body.nResults ?? 6), 10), queryTexts: [query] }) - res.status(200).json(results); + const sources = formatRetrievedScientificSources(results); + + res.status(200).json({ + ...results, + sources, + promptContext: formatSourcesForPrompt(sources), + }); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); @@ -29,4 +48,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..ec3b342 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -1,11 +1,16 @@ import type { NextApiRequest, NextApiResponse } from 'next'; +import { + SCIENTIFIC_TEXT_SEPARATORS, + ScientificDocumentChunk, + buildScientificChunkMetadata, +} from '@/utils/server/scientific-rag'; import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; +import type { File } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; -import path from 'path'; import { v4 as uuidv4 } from 'uuid'; export const config = { @@ -33,17 +38,21 @@ export default async function handler( path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); + const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : (files.pdf as File | undefined); - const originalDocs = await loader.load(); + if (!pdfFile?.filepath) { + return res.status(400).json({ error: 'A PDF file is required' }); + } - console.log(JSON.stringify(originalDocs)); + const loader = new PDFLoader(pdfFile.filepath); + const originalDocs = await loader.load(); const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, - }); + chunkSize: 900, + chunkOverlap: 150, + separators: SCIENTIFIC_TEXT_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); @@ -75,32 +84,18 @@ export default async function handler( } } -function processDocuments(docs: any) { - const ids = []; - const metadatas = []; - const documentContents = []; +function processDocuments(docs: ScientificDocumentChunk[]) { + const ids: string[] = []; + const metadatas: Array> = []; + const documentContents: string[] = []; - for (const document of docs) { - // Generate an ID for each document, or use some existing unique identifier + docs.forEach((document, index) => { const id = uuidv4(); ids.push(id); - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; - - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; - metadatas.push(metadata); - - // Add the page content to the documents array + metadatas.push(buildScientificChunkMetadata(document, index)); documentContents.push(document.pageContent); - } + }); return { ids, metadatas, documentContents }; } diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..615a841 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,5 +1,9 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; +import { + ScientificRetrievedSource, + formatSourcesForPrompt, +} from '@/utils/server/scientific-rag'; import { codeBlock, oneLine } from 'common-tags' import { ChatBody, Message } from '@/types/chat'; @@ -17,11 +21,10 @@ export const config = { // Function to fetch and format documents async function fetchAndFormatDocuments(lastMessageContent: string) { try { - console.log("fetching documents") const response = await fetch('http://localhost:3000/api/fetch-documents', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ input: lastMessageContent, nResults: 6 }), }); if (!response.ok) { @@ -29,13 +32,12 @@ async function fetchAndFormatDocuments(lastMessageContent: string) { } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - console.log(result); + if (Array.isArray(data.sources)) { + return formatSourcesForPrompt(data.sources as ScientificRetrievedSource[]); + } - return result; + return data.promptContext ?? 'No matching sources were retrieved.'; } catch (error) { console.error('Error fetching and formatting documents:', error); @@ -64,8 +66,9 @@ const handler = async (req: Request): Promise => { ${oneLine` You are a very enthusiastic AI assistant who loves to help people! Given the following information from - relevant documentation, answer the user's question using - only that information, outputted in markdown format. + relevant scientific documentation, answer the user's + question using only that information, outputted in + markdown format. `} ${oneLine` @@ -75,7 +78,8 @@ const handler = async (req: Request): Promise => { `} ${oneLine` - Always include citations from the documentation. + Every factual claim must include one or more citation + keys in square brackets exactly as provided in the sources. `} `; @@ -130,7 +134,10 @@ const handler = async (req: Request): Promise => { - Prefer splitting your response into multiple paragraphs. `} ${oneLine` - - Output as markdown with citations based on the documentation. + - Output as markdown with citation keys from the source list. + `} + ${oneLine` + - Prefer lower-distance sources when sources disagree. `} `, }, diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts new file mode 100644 index 0000000..5d363a8 --- /dev/null +++ b/ui/utils/server/scientific-rag.ts @@ -0,0 +1,230 @@ +type MetadataValue = string | number | boolean; + +export type ScientificMetadata = Record; + +export interface ScientificDocumentChunk { + pageContent: string; + metadata?: { + source?: string; + pdf?: { + info?: { + Title?: string; + }; + }; + loc?: { + pageNumber?: number; + }; + title?: string; + page?: number; + chunk?: number; + section?: string; + semanticScholarId?: string; + semanticScholarUrl?: string; + doi?: string; + url?: string; + }; +} + +export type ScientificChunkMetadata = ScientificMetadata; + +export interface ScientificRetrievedSource { + citationKey: string; + title: string; + page: number; + section: string; + content: string; + distance?: number; + sourceType?: string; + semanticScholarId?: string; + semanticScholarUrl?: string; + doi?: string; + url?: string; +} + +interface ChromaQueryResults { + documents?: Array | null> | null; + metadatas?: Array | null> | null; + distances?: Array | null> | null; +} + +const SCIENTIFIC_SECTION_PATTERNS: Array<[string, RegExp]> = [ + ['abstract', /^\s*(abstract|summary)\b[:.\-\s]*/i], + ['introduction', /^\s*(introduction|background)\b[:.\-\s]*/i], + ['methods', /^\s*(methods?|materials and methods|methodology|experimental setup)\b[:.\-\s]*/i], + ['results', /^\s*(results?|findings)\b[:.\-\s]*/i], + ['discussion', /^\s*(discussion|analysis)\b[:.\-\s]*/i], + ['conclusion', /^\s*(conclusions?|future work)\b[:.\-\s]*/i], + ['references', /^\s*(references|bibliography|works cited)\b[:.\-\s]*/i], +]; + +export const SCIENTIFIC_TEXT_SEPARATORS = [ + '\nAbstract', + '\nABSTRACT', + '\nIntroduction', + '\nINTRODUCTION', + '\nMethods', + '\nMaterials and Methods', + '\nMethodology', + '\nResults', + '\nDiscussion', + '\nConclusion', + '\nReferences', + '\n\n', + '\n', + '. ', + ' ', + '', +]; + +function metadataValueToString(value: unknown): string | undefined { + if (typeof value === 'string') { + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; + } + + if (typeof value === 'number' || typeof value === 'boolean') { + return String(value); + } + + return undefined; +} + +export function slugifyCitationPart(value: string): string { + const slug = value + .normalize('NFKD') + .replace(/[\u0300-\u036f]/g, '') + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, ''); + + return slug.length > 0 ? slug.slice(0, 80) : 'source'; +} + +export function normalizeTitle(document: ScientificDocumentChunk): string { + const metadataTitle = document.metadata?.title; + const pdfTitle = document.metadata?.pdf?.info?.Title; + const source = document.metadata?.source; + + return ( + metadataValueToString(metadataTitle) ?? + metadataValueToString(pdfTitle) ?? + metadataValueToString(source)?.split(/[\\/]/).pop() ?? + 'Untitled scientific source' + ); +} + +export function detectScientificSection(text: string, fallback = 'body'): string { + const firstLine = text.split('\n').find((line) => line.trim().length > 0) ?? ''; + + for (const [section, pattern] of SCIENTIFIC_SECTION_PATTERNS) { + if (pattern.test(firstLine)) { + return section; + } + } + + return fallback; +} + +export function createCitationKey(title: string, page: number, chunk: number): string { + return `${slugifyCitationPart(title)}:p${Math.max(page, 1)}:c${Math.max(chunk, 1)}`; +} + +export function buildScientificChunkMetadata( + document: ScientificDocumentChunk, + chunkIndex: number, +): ScientificChunkMetadata { + const metadata = document.metadata ?? {}; + const title = normalizeTitle(document); + const page = Number(metadata.page ?? metadata.loc?.pageNumber ?? 1); + const chunk = Number(metadata.chunk ?? chunkIndex + 1); + const semanticScholarId = metadataValueToString(metadata.semanticScholarId); + const semanticScholarUrl = metadataValueToString(metadata.semanticScholarUrl); + const doi = metadataValueToString(metadata.doi); + const url = metadataValueToString(metadata.url); + const source = metadataValueToString(metadata.source) ?? semanticScholarUrl ?? url ?? title; + const sourceType = semanticScholarId || semanticScholarUrl ? 'semantic_scholar' : 'upload'; + const section = + metadataValueToString(metadata.section) ?? detectScientificSection(document.pageContent); + + return { + title, + page: Number.isFinite(page) && page > 0 ? page : 1, + source, + sourceType, + section, + chunk: Number.isFinite(chunk) && chunk > 0 ? chunk : chunkIndex + 1, + citationKey: createCitationKey(title, page, chunk), + ...(semanticScholarId ? { semanticScholarId } : {}), + ...(semanticScholarUrl ? { semanticScholarUrl } : {}), + ...(doi ? { doi } : {}), + ...(url ? { url } : {}), + }; +} + +export function formatRetrievedScientificSources(results: ChromaQueryResults): ScientificRetrievedSource[] { + const documents = results.documents?.[0] ?? []; + const metadatas = results.metadatas?.[0] ?? []; + const distances = results.distances?.[0] ?? []; + + return documents.flatMap((content, index) => { + if (!content) { + return []; + } + + const metadata = metadatas[index] ?? {}; + const title = metadataValueToString(metadata.title) ?? 'Untitled scientific source'; + const page = Number(metadata.page ?? 1); + const chunk = Number(metadata.chunk ?? index + 1); + const citationKey = + metadataValueToString(metadata.citationKey) ?? + createCitationKey(title, Number.isFinite(page) ? page : 1, Number.isFinite(chunk) ? chunk : index + 1); + const distance = distances[index]; + + return [ + { + citationKey, + title, + page: Number.isFinite(page) && page > 0 ? page : 1, + section: metadataValueToString(metadata.section) ?? 'body', + content, + ...(typeof distance === 'number' ? { distance } : {}), + ...(metadataValueToString(metadata.sourceType) + ? { sourceType: metadataValueToString(metadata.sourceType) } + : {}), + ...(metadataValueToString(metadata.semanticScholarId) + ? { semanticScholarId: metadataValueToString(metadata.semanticScholarId) } + : {}), + ...(metadataValueToString(metadata.semanticScholarUrl) + ? { semanticScholarUrl: metadataValueToString(metadata.semanticScholarUrl) } + : {}), + ...(metadataValueToString(metadata.doi) ? { doi: metadataValueToString(metadata.doi) } : {}), + ...(metadataValueToString(metadata.url) ? { url: metadataValueToString(metadata.url) } : {}), + }, + ]; + }); +} + +export function formatSourcesForPrompt(sources: ScientificRetrievedSource[]): string { + if (sources.length === 0) { + return 'No matching sources were retrieved.'; + } + + return sources + .map((source, index) => { + const distance = + typeof source.distance === 'number' ? `, distance: ${source.distance.toFixed(4)}` : ''; + const external = + source.doi || source.semanticScholarId || source.semanticScholarUrl || source.url + ? `, external: ${source.doi ?? source.semanticScholarId ?? source.semanticScholarUrl ?? source.url}` + : ''; + + return [ + `Source ${index + 1} [${source.citationKey}]`, + `Title: ${source.title}`, + `Page: ${source.page}`, + `Section: ${source.section}${distance}${external}`, + `Content: ${source.content}`, + ].join('\n'); + }) + .join('\n\n'); +}