Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions ui/__tests__/utils/server/scientific-rag.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import {
buildScientificChunkMetadata,
createCitationKey,
detectScientificSection,
formatRetrievedScientificSources,
formatSourcesForPrompt,
slugifyCitationPart,
} from '@/utils/server/scientific-rag';

import { describe, expect, it } from 'vitest';

/**
 * Unit tests for the scientific RAG helpers: section detection, citation-key
 * generation, chunk metadata construction, and formatting of retrieved
 * sources into prompt context.
 */
describe('scientific RAG helpers', () => {
  it('detects common scientific sections from chunk text', () => {
    // Each entry pairs a chunk's text with the section label expected for it.
    const sectionCases: Array<[string, string]> = [
      ['Abstract\nWe study retrieval.', 'abstract'],
      ['Materials and Methods: cohort details', 'methods'],
      ['Results\nAccuracy improved.', 'results'],
      ['Unlabeled paragraph', 'body'],
    ];

    for (const [chunkText, expectedSection] of sectionCases) {
      expect(detectScientificSection(chunkText)).toBe(expectedSection);
    }
  });

  it('creates compact stable citation keys', () => {
    const rawTitle = 'A Study: Retrieval & Citations!';

    // The slug drops punctuation and joins the remaining words with hyphens.
    const slug = slugifyCitationPart(rawTitle);
    expect(slug).toBe('a-study-retrieval-citations');

    // The full key appends the page and chunk markers to the slug.
    const citationKey = createCitationKey(rawTitle, 3, 2);
    expect(citationKey).toBe('a-study-retrieval-citations:p3:c2');
  });

  it('preserves upload and Semantic Scholar metadata for indexed chunks', () => {
    // Case 1: a chunk shaped like PDF-loader output (source path, pdf info, loc).
    const expectedUpload = {
      title: 'Paper Title',
      page: 8,
      section: 'discussion',
      sourceType: 'upload',
      citationKey: 'paper-title:p8:c1',
    };

    const uploaded = buildScientificChunkMetadata(
      {
        pageContent: 'Discussion\nScientific context.',
        metadata: {
          source: '/tmp/paper.pdf',
          pdf: { info: { Title: 'Paper Title' } },
          loc: { pageNumber: 8 },
        },
      },
      0,
    );

    expect(uploaded).toMatchObject(expectedUpload);

    // Case 2: a chunk that carries Semantic Scholar identifiers and a DOI.
    const expectedReference = {
      title: 'Scholar Paper',
      sourceType: 'semantic_scholar',
      semanticScholarId: 'S2-123',
      doi: '10.1000/example',
      citationKey: 'scholar-paper:p2:c4',
    };

    const reference = buildScientificChunkMetadata(
      {
        pageContent: 'Abstract\nReference content.',
        metadata: {
          title: 'Scholar Paper',
          semanticScholarId: 'S2-123',
          semanticScholarUrl: 'https://semanticscholar.org/paper/S2-123',
          doi: '10.1000/example',
          page: 2,
        },
      },
      3,
    );

    expect(reference).toMatchObject(expectedReference);
  });

  it('formats Chroma results into citation-first prompt context', () => {
    // Two hits in one query row; the second has no stored citationKey or doi.
    const sources = formatRetrievedScientificSources({
      documents: [['Claim one', 'Claim two']],
      metadatas: [
        [
          {
            title: 'Paper A',
            page: 1,
            chunk: 1,
            section: 'methods',
            citationKey: 'paper-a:p1:c1',
            doi: '10.1000/a',
          },
          {
            title: 'Paper B',
            page: 4,
            chunk: 2,
            section: 'results',
          },
        ],
      ],
      distances: [[0.12, 0.34]],
    });

    expect(sources).toHaveLength(2);

    const expectedFirstSource = {
      citationKey: 'paper-a:p1:c1',
      distance: 0.12,
      doi: '10.1000/a',
    };
    expect(sources[0]).toMatchObject(expectedFirstSource);

    // The second source is expected to expose a key built from title/page/chunk.
    expect(sources[1].citationKey).toBe('paper-b:p4:c2');

    const promptContext = formatSourcesForPrompt(sources);

    const expectedFragments = [
      'Source 1 [paper-a:p1:c1]',
      'distance: 0.1200',
      'Section: results',
    ];
    for (const fragment of expectedFragments) {
      expect(promptContext).toContain(fragment);
    }
  });
});
29 changes: 24 additions & 5 deletions ui/pages/api/fetch-documents.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
import type { NextApiRequest, NextApiResponse } from "next";

import {
formatRetrievedScientificSources,
formatSourcesForPrompt,
} from '@/utils/server/scientific-rag';
import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
if (req.method !== 'POST') {
return res.status(405).end();
}

const client = new ChromaClient({
path: "http://chroma-server:8000",
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const query = req.body.input;
const query = String(req.body.input ?? '').trim();

if (!query) {
return res.status(400).json({ error: 'A retrieval query is required' });
}

const embedder = new TransformersEmbeddingFunction();

const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });

// query the collection
const results = await collection.query({
nResults: 4,
nResults: Math.min(Number(req.body.nResults ?? 6), 10),
queryTexts: [query]
})

res.status(200).json(results);
const sources = formatRetrievedScientificSources(results);

res.status(200).json({
...results,
sources,
promptContext: formatSourcesForPrompt(sources),
});
} catch (error) {
if (error instanceof Error) {
console.error('Error message:', error.message);
Expand All @@ -29,4 +48,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
res.status(500).json({ error: 'An unexpected error occurred :(' });
}
}
}
53 changes: 24 additions & 29 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import type { NextApiRequest, NextApiResponse } from 'next';

import {
SCIENTIFIC_TEXT_SEPARATORS,
ScientificDocumentChunk,
buildScientificChunkMetadata,
} from '@/utils/server/scientific-rag';
import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import type { File } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

import path from 'path';
import { v4 as uuidv4 } from 'uuid';

export const config = {
Expand Down Expand Up @@ -33,17 +38,21 @@ export default async function handler(
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);
const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : (files.pdf as File | undefined);

const originalDocs = await loader.load();
if (!pdfFile?.filepath) {
return res.status(400).json({ error: 'A PDF file is required' });
}

console.log(JSON.stringify(originalDocs));
const loader = new PDFLoader(pdfFile.filepath);

const originalDocs = await loader.load();

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
chunkSize: 900,
chunkOverlap: 150,
separators: SCIENTIFIC_TEXT_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);

Expand Down Expand Up @@ -75,32 +84,18 @@ export default async function handler(
}
}

/**
 * Builds three parallel arrays from a list of document chunks: a freshly
 * generated UUID per chunk (`ids`), one metadata record per chunk
 * (`metadatas`), and each chunk's `pageContent` (`documentContents`).
 * Returns them together as `{ ids, metadatas, documentContents }`.
 *
 * NOTE(review): this span appears to show the removed and the added lines of
 * a diff side by side (two function headers, a `for...of` loop next to a
 * `forEach`) - confirm which version is current before editing the code.
 */
function processDocuments(docs: any) {
const ids = [];
const metadatas = [];
const documentContents = [];
function processDocuments(docs: ScientificDocumentChunk[]) {
const ids: string[] = [];
const metadatas: Array<Record<string, string | number | boolean>> = [];
const documentContents: string[] = [];

for (const document of docs) {
// Every chunk receives a newly generated UUID as its ID.
docs.forEach((document, index) => {
const id = uuidv4();
ids.push(id);

// Use the PDF's embedded Title when it is non-empty; otherwise fall back to
// the base name of the source path.
const fallbackTitle = path.basename(document.metadata.source);
const titleFromMetadata = document.metadata.pdf.info.Title;

const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;


const metadata = {
title: title,
page: document.metadata.loc.pageNumber, // page number taken from the chunk's loc metadata
source: document.metadata.source, // source path taken from the chunk's metadata
};
metadatas.push(metadata);

// Metadata record produced by buildScientificChunkMetadata from the chunk and its index.
metadatas.push(buildScientificChunkMetadata(document, index));
// Add the page content to the documents array
documentContents.push(document.pageContent);
}
});

return { ids, metadatas, documentContents };
}
29 changes: 18 additions & 11 deletions ui/pages/api/rag-chat.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const';
import { OpenAIError, OpenAIStream } from '@/utils/server';
import {
ScientificRetrievedSource,
formatSourcesForPrompt,
} from '@/utils/server/scientific-rag';
import { codeBlock, oneLine } from 'common-tags'

import { ChatBody, Message } from '@/types/chat';
Expand All @@ -17,25 +21,23 @@ export const config = {
// Function to fetch and format documents
async function fetchAndFormatDocuments(lastMessageContent: string) {
try {
console.log("fetching documents")
const response = await fetch('http://localhost:3000/api/fetch-documents', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ input: lastMessageContent }),
body: JSON.stringify({ input: lastMessageContent, nResults: 6 }),
});

if (!response.ok) {
throw new Error(`Error fetching documents: ${response.statusText}`);
}

const data = await response.json();
const result = data.metadatas[0].map((metadata: any, index: number) => {
return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
}).join('');

console.log(result);
if (Array.isArray(data.sources)) {
return formatSourcesForPrompt(data.sources as ScientificRetrievedSource[]);
}

return result;
return data.promptContext ?? 'No matching sources were retrieved.';

} catch (error) {
console.error('Error fetching and formatting documents:', error);
Expand Down Expand Up @@ -64,8 +66,9 @@ const handler = async (req: Request): Promise<Response> => {
${oneLine`
You are a very enthusiastic AI assistant who loves
to help people! Given the following information from
relevant documentation, answer the user's question using
only that information, outputted in markdown format.
relevant scientific documentation, answer the user's
question using only that information, outputted in
markdown format.
`}

${oneLine`
Expand All @@ -75,7 +78,8 @@ const handler = async (req: Request): Promise<Response> => {
`}

${oneLine`
Always include citations from the documentation.
Every factual claim must include one or more citation
keys in square brackets exactly as provided in the sources.
`}
`;

Expand Down Expand Up @@ -130,7 +134,10 @@ const handler = async (req: Request): Promise<Response> => {
- Prefer splitting your response into multiple paragraphs.
`}
${oneLine`
- Output as markdown with citations based on the documentation.
- Output as markdown with citation keys from the source list.
`}
${oneLine`
- Prefer lower-distance sources when sources disagree.
`}
`,
},
Expand Down
Loading