From 4efe907471a11d1dbc0fb39ae0cbe09cf33717bb Mon Sep 17 00:00:00 2001
From: Jake Browning <jp88soccer@gmail.com>
Date: Thu, 21 May 2026 21:06:25 -0500
Subject: [PATCH] Enhance scientific RAG citations and retrieval

---
 .../utils/server/scientific-rag.test.ts       | 111 +++++++++
 ui/pages/api/fetch-documents.ts               |  29 ++-
 ui/pages/api/inject-documents.ts              |  53 ++--
 ui/pages/api/rag-chat.ts                      |  29 ++-
 ui/utils/server/scientific-rag.ts             | 230 ++++++++++++++++++
 5 files changed, 407 insertions(+), 45 deletions(-)
 create mode 100644 ui/__tests__/utils/server/scientific-rag.test.ts
 create mode 100644 ui/utils/server/scientific-rag.ts

diff --git a/ui/__tests__/utils/server/scientific-rag.test.ts b/ui/__tests__/utils/server/scientific-rag.test.ts
new file mode 100644
index 0000000..1fbdfbb
--- /dev/null
+++ b/ui/__tests__/utils/server/scientific-rag.test.ts
@@ -0,0 +1,111 @@
+import {
+  buildScientificChunkMetadata,
+  createCitationKey,
+  detectScientificSection,
+  formatRetrievedScientificSources,
+  formatSourcesForPrompt,
+  slugifyCitationPart,
+} from '@/utils/server/scientific-rag';
+
+import { describe, expect, it } from 'vitest';
+// Unit tests for the scientific-rag helpers: section detection, citation keys, chunk metadata, prompt formatting.
+describe('scientific RAG helpers', () => {
+  it('detects common scientific sections from chunk text', () => {
+    expect(detectScientificSection('Abstract\nWe study retrieval.')).toBe('abstract');
+    expect(detectScientificSection('Materials and Methods: cohort details')).toBe('methods');
+    expect(detectScientificSection('Results\nAccuracy improved.')).toBe('results');
+    expect(detectScientificSection('Unlabeled paragraph')).toBe('body');
+  });
+
+  it('creates compact stable citation keys', () => {
+    expect(slugifyCitationPart('A Study: Retrieval & Citations!')).toBe(
+      'a-study-retrieval-citations',
+    );
+    expect(createCitationKey('A Study: Retrieval & Citations!', 3, 2)).toBe(
+      'a-study-retrieval-citations:p3:c2',
+    );
+  });
+
+  it('preserves upload and Semantic Scholar metadata for indexed chunks', () => {
+    const uploaded = buildScientificChunkMetadata(
+      {
+        pageContent: 'Discussion\nScientific context.',
+        metadata: {
+          source: '/tmp/paper.pdf',
+          pdf: { info: { Title: 'Paper Title' } },
+          loc: { pageNumber: 8 },
+        },
+      },
+      0, // chunkIndex 0: the key asserted below ends in c1
+    );
+
+    expect(uploaded).toMatchObject({
+      title: 'Paper Title',
+      page: 8,
+      section: 'discussion',
+      sourceType: 'upload',
+      citationKey: 'paper-title:p8:c1',
+    });
+
+    const reference = buildScientificChunkMetadata(
+      {
+        pageContent: 'Abstract\nReference content.',
+        metadata: {
+          title: 'Scholar Paper',
+          semanticScholarId: 'S2-123',
+          semanticScholarUrl: 'https://semanticscholar.org/paper/S2-123',
+          doi: '10.1000/example',
+          page: 2,
+        },
+      },
+      3, // chunkIndex 3: the key asserted below ends in c4
+    );
+
+    expect(reference).toMatchObject({
+      title: 'Scholar Paper',
+      sourceType: 'semantic_scholar',
+      semanticScholarId: 'S2-123',
+      doi: '10.1000/example',
+      citationKey: 'scholar-paper:p2:c4',
+    });
+  });
+
+  it('formats Chroma results into citation-first prompt context', () => {
+    const sources = formatRetrievedScientificSources({
+      documents: [['Claim one', 'Claim two']],
+      metadatas: [
+        [
+          {
+            title: 'Paper A',
+            page: 1,
+            chunk: 1,
+            section: 'methods',
+            citationKey: 'paper-a:p1:c1',
+            doi: '10.1000/a',
+          },
+          { // no stored citationKey: the derived key is asserted below
+            title: 'Paper B',
+            page: 4,
+            chunk: 2,
+            section: 'results',
+          },
+        ],
+      ],
+      distances: [[0.12, 0.34]],
+    });
+
+    expect(sources).toHaveLength(2);
+    expect(sources[0]).toMatchObject({
+      citationKey: 'paper-a:p1:c1',
+      distance: 0.12,
+      doi: '10.1000/a',
+    });
+    expect(sources[1].citationKey).toBe('paper-b:p4:c2');
+
+    const promptContext = formatSourcesForPrompt(sources);
+
+    expect(promptContext).toContain('Source 1 [paper-a:p1:c1]');
+    expect(promptContext).toContain('distance: 0.1200');
+    expect(promptContext).toContain('Section: results');
+  });
+});
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts
index 9304e48..78c6488 100644
--- a/ui/pages/api/fetch-documents.ts
+++ b/ui/pages/api/fetch-documents.ts
@@ -1,13 +1,26 @@
 import type { NextApiRequest, NextApiResponse } from "next";
+
+import {
+  formatRetrievedScientificSources,
+  formatSourcesForPrompt,
+} from '@/utils/server/scientific-rag';
 import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   try {
+    if (req.method !== 'POST') {
+      return res.status(405).end();
+    }
+
     const client = new ChromaClient({
-      path: "http://chroma-server:8000",
+      path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
     });
 
-    const query = req.body.input;
+    const query = String(req.body.input ?? '').trim();
+
+    if (!query) {
+      return res.status(400).json({ error: 'A retrieval query is required' });
+    }
 
     const embedder = new TransformersEmbeddingFunction();
 
@@ -15,11 +28,17 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
 
   // query the collection
   const results = await collection.query({
-      nResults: 4, 
+      nResults: Math.min(Number(req.body.nResults ?? 6), 10),
       queryTexts: [query]
   }) 
 
-    res.status(200).json(results);
+    const sources = formatRetrievedScientificSources(results);
+
+    res.status(200).json({
+      ...results,
+      sources,
+      promptContext: formatSourcesForPrompt(sources),
+    });
   } catch (error) {
     if (error instanceof Error) {
       console.error('Error message:', error.message);
@@ -29,4 +48,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     }
     res.status(500).json({ error: 'An unexpected error occurred :(' });
   }
-}
\ No newline at end of file
+}
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
index 532a635..ec3b342 100644
--- a/ui/pages/api/inject-documents.ts
+++ b/ui/pages/api/inject-documents.ts
@@ -1,11 +1,16 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 
+import {
+  SCIENTIFIC_TEXT_SEPARATORS,
+  ScientificDocumentChunk,
+  buildScientificChunkMetadata,
+} from '@/utils/server/scientific-rag';
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
+import type { File } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 
-import path from 'path';
 import { v4 as uuidv4 } from 'uuid';
 
 export const config = {
@@ -33,17 +38,21 @@ export default async function handler(
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
+      const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : (files.pdf as File | undefined);
 
-      const originalDocs = await loader.load();
+      if (!pdfFile?.filepath) {
+        return res.status(400).json({ error: 'A PDF file is required' });
+      }
 
-      console.log(JSON.stringify(originalDocs));
+      const loader = new PDFLoader(pdfFile.filepath);
 
+      const originalDocs = await loader.load();
 
       const splitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 500,
-        chunkOverlap: 100,
-      });      
+        chunkSize: 900,
+        chunkOverlap: 150,
+        separators: SCIENTIFIC_TEXT_SEPARATORS,
+      });
 
       const docs = await splitter.splitDocuments(originalDocs);
  
@@ -75,32 +84,18 @@ export default async function handler(
   }
 }
 
-function processDocuments(docs: any) {
-  const ids = [];
-  const metadatas = [];
-  const documentContents = [];
+function processDocuments(docs: ScientificDocumentChunk[]) {
+  const ids: string[] = [];
+  const metadatas: Array<Record<string, string | number | boolean>> = [];
+  const documentContents: string[] = [];
 
-  for (const document of docs) {
-    // Generate an ID for each document, or use some existing unique identifier
+  docs.forEach((document, index) => {
     const id = uuidv4();
     ids.push(id);
 
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-  
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
-    metadatas.push(metadata);
-
-    // Add the page content to the documents array
+    metadatas.push(buildScientificChunkMetadata(document, index));
     documentContents.push(document.pageContent);
-  }
+  });
 
   return { ids, metadatas, documentContents };
 }
diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts
index ce84d67..615a841 100644
--- a/ui/pages/api/rag-chat.ts
+++ b/ui/pages/api/rag-chat.ts
@@ -1,5 +1,9 @@
 import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const';
 import { OpenAIError, OpenAIStream } from '@/utils/server';
+import {
+  ScientificRetrievedSource,
+  formatSourcesForPrompt,
+} from '@/utils/server/scientific-rag';
 import { codeBlock, oneLine } from 'common-tags'
 
 import { ChatBody, Message } from '@/types/chat';
@@ -17,11 +21,10 @@ export const config = {
 // Function to fetch and format documents
 async function fetchAndFormatDocuments(lastMessageContent: string) {
   try {
-    console.log("fetching documents")
     const response = await fetch('http://localhost:3000/api/fetch-documents', {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ input: lastMessageContent }),
+      body: JSON.stringify({ input: lastMessageContent, nResults: 6 }),
     });
     
     if (!response.ok) {
@@ -29,13 +32,12 @@ async function fetchAndFormatDocuments(lastMessageContent: string) {
     }
 
     const data = await response.json();
-    const result = data.metadatas[0].map((metadata: any, index: number) => {
-      return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
-    }).join('');
 
-    console.log(result);
+    if (Array.isArray(data.sources)) {
+      return formatSourcesForPrompt(data.sources as ScientificRetrievedSource[]);
+    }
 
-    return result;
+    return data.promptContext ?? 'No matching sources were retrieved.';
 
   } catch (error) {
     console.error('Error fetching and formatting documents:', error);
@@ -64,8 +66,9 @@ const handler = async (req: Request): Promise<Response> => {
     ${oneLine`
       You are a very enthusiastic AI assistant  who loves
       to help people! Given the following information from
-      relevant documentation, answer the user's question using
-      only that information, outputted in markdown format.
+      relevant scientific documentation, answer the user's
+      question using only that information, outputted in
+      markdown format.
     `}
 
     ${oneLine`
@@ -75,7 +78,8 @@ const handler = async (req: Request): Promise<Response> => {
     `}
     
     ${oneLine`
-      Always include citations from the documentation.
+      Every factual claim must include one or more citation
+      keys in square brackets exactly as provided in the sources.
     `}
   `;
 
@@ -130,7 +134,10 @@ const handler = async (req: Request): Promise<Response> => {
             - Prefer splitting your response into multiple paragraphs.
           `}
           ${oneLine`
-            - Output as markdown with citations based on the documentation.
+            - Output as markdown with citation keys from the source list.
+          `}
+          ${oneLine`
+            - Prefer lower-distance sources when sources disagree.
           `}
         `,
       },
diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts
new file mode 100644
index 0000000..5d363a8
--- /dev/null
+++ b/ui/utils/server/scientific-rag.ts
@@ -0,0 +1,230 @@
+type MetadataValue = string | number | boolean; // scalar kinds a metadata field may hold
+/** Flat, string-keyed record of scalar metadata values. */
+export type ScientificMetadata = Record<string, MetadataValue>;
+/** A text chunk plus optional loader/reference metadata, as consumed by buildScientificChunkMetadata. */
+export interface ScientificDocumentChunk {
+  pageContent: string;
+  metadata?: {
+    source?: string; // origin path; its last `/` or `\` segment is the final title fallback
+    pdf?: {
+      info?: {
+        Title?: string; // PDF-embedded title, used when `title` is absent
+      };
+    };
+    loc?: {
+      pageNumber?: number; // page fallback when `page` is absent
+    };
+    title?: string;
+    page?: number;
+    chunk?: number;
+    section?: string;
+    semanticScholarId?: string;
+    semanticScholarUrl?: string;
+    doi?: string;
+    url?: string;
+  };
+}
+/** Metadata record produced for each indexed chunk; currently just an alias of ScientificMetadata. */
+export type ScientificChunkMetadata = ScientificMetadata;
+/** One retrieved chunk normalized for citation; this is the shape formatSourcesForPrompt renders. */
+export interface ScientificRetrievedSource {
+  citationKey: string; // e.g. `paper-a:p1:c1`
+  title: string;
+  page: number;
+  section: string;
+  content: string; // the retrieved chunk text
+  distance?: number; // distance reported for this query result, when numeric
+  sourceType?: string;
+  semanticScholarId?: string;
+  semanticScholarUrl?: string;
+  doi?: string;
+  url?: string;
+}
+/** Subset of a Chroma query result read by this module; only the first inner array of each field is used. */
+interface ChromaQueryResults {
+  documents?: Array<Array<string | null> | null> | null;
+  metadatas?: Array<Array<ScientificMetadata | null> | null> | null;
+  distances?: Array<Array<number | null> | null> | null;
+}
+// Ordered [section label, heading pattern] pairs; patterns are case-insensitive and anchored at the start.
+const SCIENTIFIC_SECTION_PATTERNS: Array<[string, RegExp]> = [
+  ['abstract', /^\s*(abstract|summary)\b[:.\-\s]*/i],
+  ['introduction', /^\s*(introduction|background)\b[:.\-\s]*/i],
+  ['methods', /^\s*(methods?|materials and methods|methodology|experimental setup)\b[:.\-\s]*/i],
+  ['results', /^\s*(results?|findings)\b[:.\-\s]*/i],
+  ['discussion', /^\s*(discussion|analysis)\b[:.\-\s]*/i],
+  ['conclusion', /^\s*(conclusions?|future work)\b[:.\-\s]*/i],
+  ['references', /^\s*(references|bibliography|works cited)\b[:.\-\s]*/i],
+];
+/** Chunk separators, most specific first: section-heading markers, then blank line, newline, sentence, space, ''. */
+export const SCIENTIFIC_TEXT_SEPARATORS = [
+  '\nAbstract',
+  '\nABSTRACT',
+  '\nIntroduction',
+  '\nINTRODUCTION',
+  '\nMethods',
+  '\nMaterials and Methods',
+  '\nMethodology',
+  '\nResults',
+  '\nDiscussion',
+  '\nConclusion',
+  '\nReferences',
+  '\n\n',
+  '\n',
+  '. ',
+  ' ',
+  '',
+];
+
+/** Normalizes a scalar: trimmed non-empty string, or String(number/boolean); anything else is undefined. */
+function metadataValueToString(value: unknown): string | undefined {
+  if (typeof value === 'number' || typeof value === 'boolean') {
+    return String(value);
+  }
+  if (typeof value !== 'string') {
+    return undefined;
+  }
+
+  const text = value.trim();
+  return text === '' ? undefined : text;
+}
+
+/** Lowercase ASCII slug of `value` (max 80 chars, no leading/trailing hyphen); 'source' when nothing is left. */
+export function slugifyCitationPart(value: string): string {
+  const ascii = value.normalize('NFKD').replace(/[\u0300-\u036f]/g, '').toLowerCase();
+  const slug = ascii
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+/, '')
+    .slice(0, 80) // cap before the final trim so a hyphen exposed by the cut is removed too
+    .replace(/-+$/, '');
+  return slug.length > 0 ? slug : 'source';
+}
+
+/**
+ * Picks a display title: explicit `title`, then the PDF `Title`, then the last path segment of `source`,
+ * else 'Untitled scientific source'. Blank candidates (e.g. a source ending in '/') are skipped.
+ */
+export function normalizeTitle(document: ScientificDocumentChunk): string {
+  const { title, pdf, source } = document.metadata ?? {};
+  const fileName = metadataValueToString(source)?.split(/[\\/]/).pop();
+
+  const candidates = [title, pdf?.info?.Title, fileName];
+  const found = candidates.map(metadataValueToString).find((text) => text !== undefined);
+  return found ?? 'Untitled scientific source';
+}
+
+/** Labels `text` by testing its first non-blank line against the section heading patterns; else `fallback`. */
+export function detectScientificSection(text: string, fallback = 'body'): string {
+  let heading = '';
+  for (const line of text.split('\n')) {
+    if (line.trim().length > 0) {
+      heading = line;
+      break;
+    }
+  }
+  return SCIENTIFIC_SECTION_PATTERNS.find(([, pattern]) => pattern.test(heading))?.[0] ?? fallback;
+}
+/** Builds `<title slug>:p<page>:c<chunk>`; a page or chunk that is NaN, infinite or below 1 is written as 1. */
+export function createCitationKey(title: string, page: number, chunk: number): string {
+  const atLeastOne = (n: number): number => (Number.isFinite(n) && n >= 1 ? n : 1); // Math.max(NaN, 1) is NaN
+  return `${slugifyCitationPart(title)}:p${atLeastOne(page)}:c${atLeastOne(chunk)}`;
+}
+/**
+ * Builds the flat metadata stored with an indexed chunk. `page` / `chunk` fall back to 1 / `chunkIndex + 1`
+ * unless they are finite numbers > 0, and `citationKey` is derived from those same sanitized values.
+ */
+export function buildScientificChunkMetadata(
+  document: ScientificDocumentChunk,
+  chunkIndex: number,
+): ScientificChunkMetadata {
+  const metadata = document.metadata ?? {};
+  const title = normalizeTitle(document);
+  const positiveOr = (n: number, fallback: number): number => (Number.isFinite(n) && n > 0 ? n : fallback);
+  const page = positiveOr(Number(metadata.page ?? metadata.loc?.pageNumber ?? 1), 1);
+  const chunk = positiveOr(Number(metadata.chunk ?? chunkIndex + 1), chunkIndex + 1);
+  const semanticScholarId = metadataValueToString(metadata.semanticScholarId);
+  const semanticScholarUrl = metadataValueToString(metadata.semanticScholarUrl);
+  const doi = metadataValueToString(metadata.doi);
+  const url = metadataValueToString(metadata.url);
+  return {
+    title,
+    page,
+    source: metadataValueToString(metadata.source) ?? semanticScholarUrl ?? url ?? title,
+    sourceType: semanticScholarId || semanticScholarUrl ? 'semantic_scholar' : 'upload',
+    section: metadataValueToString(metadata.section) ?? detectScientificSection(document.pageContent),
+    chunk,
+    citationKey: createCitationKey(title, page, chunk),
+    ...(semanticScholarId ? { semanticScholarId } : {}),
+    ...(semanticScholarUrl ? { semanticScholarUrl } : {}),
+    ...(doi ? { doi } : {}),
+    ...(url ? { url } : {}),
+  };
+}
+
+/**
+ * Flattens the first result set of a Chroma query into source records, skipping empty or null documents.
+ * Missing metadata falls back to defaults; a citation key is derived when the stored metadata has none.
+ */
+export function formatRetrievedScientificSources(results: ChromaQueryResults): ScientificRetrievedSource[] {
+  const documents = results.documents?.[0] ?? [];
+  const metadatas = results.metadatas?.[0] ?? [];
+  const distances = results.distances?.[0] ?? [];
+  const sources: ScientificRetrievedSource[] = [];
+  documents.forEach((content, index) => {
+    if (!content) {
+      return;
+    }
+    const metadata = metadatas[index] ?? {};
+    const title = metadataValueToString(metadata.title) ?? 'Untitled scientific source';
+    const page = Number(metadata.page ?? 1);
+    const chunk = Number(metadata.chunk ?? index + 1);
+    const keyPage = Number.isFinite(page) ? page : 1;
+    const keyChunk = Number.isFinite(chunk) ? chunk : index + 1;
+    const distance = distances[index];
+    const sourceType = metadataValueToString(metadata.sourceType);
+    const semanticScholarId = metadataValueToString(metadata.semanticScholarId);
+    const semanticScholarUrl = metadataValueToString(metadata.semanticScholarUrl);
+    const doi = metadataValueToString(metadata.doi);
+    const url = metadataValueToString(metadata.url);
+    sources.push({
+      citationKey: metadataValueToString(metadata.citationKey) ?? createCitationKey(title, keyPage, keyChunk),
+      title,
+      page: Number.isFinite(page) && page > 0 ? page : 1,
+      section: metadataValueToString(metadata.section) ?? 'body',
+      content,
+      ...(typeof distance === 'number' ? { distance } : {}),
+      ...(sourceType ? { sourceType } : {}),
+      ...(semanticScholarId ? { semanticScholarId } : {}),
+      ...(semanticScholarUrl ? { semanticScholarUrl } : {}),
+      ...(doi ? { doi } : {}),
+      ...(url ? { url } : {}),
+    });
+  });
+
+  return sources;
+}
+
+/**
+ * Renders sources as numbered prompt blocks (citation key, title, page, section, content) separated by
+ * blank lines. Distance and the first non-empty external id are appended to the section line if present.
+ */
+export function formatSourcesForPrompt(sources: ScientificRetrievedSource[]): string {
+  if (sources.length === 0) {
+    return 'No matching sources were retrieved.';
+  }
+
+  const blocks = sources.map((source, index) => {
+    const distance = typeof source.distance === 'number' ? `, distance: ${source.distance.toFixed(4)}` : '';
+    // `||` rather than `??`: an empty-string id must fall through to the next candidate, not print blank.
+    const externalId = source.doi || source.semanticScholarId || source.semanticScholarUrl || source.url;
+    const external = externalId ? `, external: ${externalId}` : '';
+    return [
+      `Source ${index + 1} [${source.citationKey}]`,
+      `Title: ${source.title}`,
+      `Page: ${source.page}`,
+      `Section: ${source.section}${distance}${external}`,
+      `Content: ${source.content}`,
+    ].join('\n');
+  });
+  return blocks.join('\n\n');
+}