DiscourseGraphs · sid597 · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/apps/website/app/api/ai/extract/route.ts b/apps/website/app/api/ai/extract/route.ts
@@ -0,0 +1,201 @@
+import { NextRequest, NextResponse } from "next/server";
+import {
+  ExtractionRequestSchema,
+  EXTRACTION_RESULT_JSON_SCHEMA,
+  type ExtractionResponse,
+  type ProviderId,
+} from "~/types/extraction";
+import type { LLMProviderConfig, Message, Settings } from "~/types/llm";
+import {
+  anthropicConfig,
+  openaiConfig,
+  geminiConfig,
+} from "~/utils/llm/providers";
+import {
+  DEFAULT_EXTRACTION_PROMPT,
+  buildUserPrompt,
+} from "~/prompts/extraction";
+import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
+
+export const runtime = "nodejs"; // Next.js route segment config: run this handler on the Node.js runtime.
+export const maxDuration = 300; // Route segment config: execution limit in seconds; the upstream fetch in POST aborts at 270 s, inside this limit.
+
+const PROVIDER_CONFIGS: Record<ProviderId, LLMProviderConfig> = { // Lookup table from provider id to its config; the Record type requires an entry for every ProviderId.
+  anthropic: anthropicConfig,
+  openai: openaiConfig,
+  gemini: geminiConfig,
+};
+
+const buildExtractionMessages = ({ // Builds a one-element Message[]: a single user message carrying the PDF plus the text prompt, shaped per provider.
+  provider,
+  pdfBase64,
+  userPrompt,
+}: {
+  provider: ProviderId;
+  pdfBase64: string;
+  userPrompt: string;
+}): Message[] => {
+  switch (provider) {
+    case "anthropic": // A "document" block with a base64 source, followed by a "text" block.
+      return [
+        {
+          role: "user",
+          content: [
+            {
+              type: "document",
+              source: {
+                type: "base64",
+                media_type: "application/pdf", // eslint-disable-line @typescript-eslint/naming-convention
+                data: pdfBase64,
+              },
+            },
+            { type: "text", text: userPrompt },
+          ],
+        },
+      ];
+    case "openai": // A "file" block whose file_data is a base64 data URL, followed by a "text" block.
+      return [
+        {
+          role: "user",
+          content: [
+            {
+              type: "file",
+              file: {
+                filename: "paper.pdf",
+                file_data: `data:application/pdf;base64,${pdfBase64}`, // eslint-disable-line @typescript-eslint/naming-convention
+              },
+            },
+            { type: "text", text: userPrompt },
+          ],
+        },
+      ];
+    case "gemini": // An inlineData part (mimeType + data), followed by a text part; these parts carry no "type" tag.
+      return [
+        {
+          role: "user",
+          content: [
+            {
+              inlineData: {
+                mimeType: "application/pdf",
+                data: pdfBase64,
+              },
+            },
+            { text: userPrompt },
+          ],
+        },
+      ];
+  } // No default branch: each of the three ProviderId members has a case above that returns.
+};
+
+export const POST = async ( // POST handler: validates the request, sends the PDF and prompt to the chosen provider, and returns parsed nodes or an error object.
+  request: NextRequest,
+): Promise<NextResponse<ExtractionResponse>> => {
+  let body: unknown;
+  try {
+    body = await request.json();
+  } catch { // The request body could not be read as JSON -> 400.
+    return NextResponse.json(
+      { success: false, error: "Invalid JSON body" },
+      { status: 400 },
+    );
+  }
+
+  const validated = ExtractionRequestSchema.safeParse(body); // safeParse reports failure through .success instead of throwing.
+  if (!validated.success) {
+    return NextResponse.json(
+      { success: false, error: validated.error.message },
+      { status: 400 },
+    );
+  }
+
+  const { pdfBase64, researchQuestion, model, provider, systemPrompt } =
+    validated.data;
+
+  const config = PROVIDER_CONFIGS[provider];
+  const apiKey = process.env[config.apiKeyEnvVar]; // The key comes from the server environment, never from the request.
+
+  if (!apiKey) { // Missing server-side configuration -> 500.
+    return NextResponse.json(
+      { success: false, error: `API key not configured for ${provider}.` },
+      { status: 500 },
+    );
+  }
+
+  const messages = buildExtractionMessages({
+    provider,
+    pdfBase64,
+    userPrompt: buildUserPrompt(researchQuestion),
+  });
+
+  const settings: Settings = {
+    model,
+    maxTokens: 16384,
+    temperature: 0.6,
+    systemPrompt: systemPrompt ?? DEFAULT_EXTRACTION_PROMPT, // A caller-supplied prompt wins; null/undefined falls back to the default.
+    outputSchema: EXTRACTION_RESULT_JSON_SCHEMA,
+  };
+
+  const apiUrl =
+    typeof config.apiUrl === "function" // apiUrl is either a fixed value or a function of the settings.
+      ? config.apiUrl(settings)
+      : config.apiUrl;
+
+  try {
+    const response = await fetch(apiUrl, {
+      method: "POST",
+      headers: config.apiHeaders(apiKey),
+      body: JSON.stringify(config.formatRequestBody(messages, settings)),
+      signal: AbortSignal.timeout(270_000), // Abort the upstream call after 270 s (maxDuration for this route is 300).
+    });
+
+    if (!response.ok) { // Non-2xx from the provider -> 502.
+      const errorText = await response.text().catch(() => ""); // If the error body cannot be read, use an empty string.
+      return NextResponse.json(
+        {
+          success: false,
+          error: `${provider} API error (${response.status}): ${errorText.slice(0, 200)}`, // Only the first 200 characters of the upstream body are forwarded.
+        },
+        { status: 502 },
+      );
+    }
+
+    const responseData: unknown = await response.json();
+    const rawText = config.extractResponseText(responseData);
+
+    if (!rawText) { // A falsy (e.g. empty) extracted text -> 502.
+      return NextResponse.json(
+        { success: false, error: `Empty response from ${provider}` },
+        { status: 502 },
+      );
+    }
+
+    let result;
+    try {
+      result = parseExtractionResponse(rawText);
+    } catch (parseError) { // A SyntaxError is reported as invalid JSON; any other error as an unexpected structure. Both -> 502.
+      const message =
+        parseError instanceof SyntaxError
+          ? "LLM returned invalid JSON"
+          : "LLM returned unexpected response structure";
+      return NextResponse.json(
+        {
+          success: false,
+          error: `Failed to parse extraction response — ${message}`,
+        },
+        { status: 502 },
+      );
+    }
+
+    return NextResponse.json({ success: true, data: result });
+  } catch (error) { // Anything else thrown in the try block (fetch rejection incl. the timeout abort, response.json() failure) -> 500. NOTE(review): consider 504 for timeouts and confirm error.message is safe to return to clients.
+    const message =
+      error instanceof Error
+        ? `Extraction failed — ${error.message}`
+        : "Extraction failed";
+    console.error("AI extraction failed:", error);
+    return NextResponse.json(
+      { success: false, error: message },
+      { status: 500 },
+    );
+  }
+};
diff --git a/apps/website/app/prompts/extraction.ts b/apps/website/app/prompts/extraction.ts
@@ -0,0 +1,59 @@
+export const DEFAULT_EXTRACTION_PROMPT = `You are a research analyst extracting discourse graph nodes from academic papers.
+
+Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.
+
+## Node Types
+
+- **Evidence**: A discrete observation from a published dataset or experiment cited in the paper (prior work). Past tense. Includes observable, model system, method. Quantitative details when available.
+- **Claim**: An interpretive assertion by the authors. Debatable — goes beyond data to state what it means. Specific enough to test or argue against.
+- **Question**: A research question — explicitly stated or implied by a gap in the literature. Open-ended.
+- **Result**: A discrete observation from this paper's own experiments. Same structure as Evidence but from the current work, not prior studies. Past tense.
+- **Theory**: A theoretical framework or model used or proposed. Name it, state its core proposition.
+- **Source**: A cited publication. Author(s) and year.
+
+## Quality
+
+- Atomic: one idea per node. Split compound sentences.
+- Self-contained: understandable without the paper.
+- Faithful: no inference or editorializing.
+- Specific: "X reduced Y by 43% in Z" not "X was effective."
+- 8–25 nodes. Quality over quantity. Cover all sections.
+- Evidence = prior work cited. Result = this paper's experiments.
+
+## Example
+
+Excerpt (Results):
+"CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001). This correlated with elevated CD62L and CCR7 (Fig 3B), suggesting a memory-like phenotype resisting exhaustion."
+
+{
+  "nodes": [
+    {
+      "nodeType": "Result",
+      "content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
+      "supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
+      "sourceSection": "Results"
+    },
+    {
+      "nodeType": "Result",
+      "content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
+      "supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
+      "sourceSection": "Results"
+    },
+    {
+      "nodeType": "Claim",
+      "content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
+      "supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
+      "sourceSection": "Results"
+    }
+  ]
+}`; // Default system prompt text: node-type definitions, quality rules, and one worked example whose JSON uses the nodes/nodeType/content/supportSnippet/sourceSection keys.
+
+export const buildUserPrompt = (researchQuestion?: string): string => { // Builds the user-turn text; appends a focus line when a research question is supplied.
+  let prompt = "Extract discourse graph nodes from the attached paper.";
+
+  if (researchQuestion) { // Truthiness check: undefined and "" both skip the focus line. NOTE(review): a whitespace-only string still passes — consider trimming.
+    prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
+  }
+
+  return prompt;
+};
diff --git a/apps/website/app/types/extraction.ts b/apps/website/app/types/extraction.ts
@@ -0,0 +1,59 @@
+import { z } from "zod";
+
+export const PROVIDER_IDS = ["anthropic", "openai", "gemini"] as const; // `as const` keeps the literal element types so a union can be derived from the array.
+
+export type ProviderId = (typeof PROVIDER_IDS)[number]; // Resolves to "anthropic" | "openai" | "gemini".
+
+// eslint-disable-next-line @typescript-eslint/naming-convention
+export const ExtractedNodeSchema = z.object({ // Runtime validator for a single extracted node.
+  nodeType: z.string(), // Any string is accepted; the value is not restricted to a fixed set here.
+  content: z.string(),
+  supportSnippet: z.string(),
+  sourceSection: z.string().nullable(), // May be null, but the key is still required (nullable, not optional).
+});
+
+export type ExtractedNode = z.infer<typeof ExtractedNodeSchema>; // Static type derived from the schema above.
+
+// eslint-disable-next-line @typescript-eslint/naming-convention
+export const ExtractionResultSchema = z.object({ // Runtime validator for the whole result: an object with a "nodes" array.
+  nodes: z.array(ExtractedNodeSchema), // No minimum length is set, so an empty array validates.
+});
+
+export type ExtractionResult = z.infer<typeof ExtractionResultSchema>; // Static type derived from the schema above.
+
+// eslint-disable-next-line @typescript-eslint/naming-convention
+export const ExtractionRequestSchema = z.object({ // Runtime validator for the request body.
+  pdfBase64: z.string().min(1).max(44_000_000), // Non-empty; capped at 44,000,000 characters (about 33 MB decoded, at 3 bytes per 4 base64 chars).
+  provider: z.enum(PROVIDER_IDS), // Must be one of the PROVIDER_IDS literals.
+  model: z.string().min(1), // Non-empty; not checked against a list of known models here.
+  researchQuestion: z.string().optional(), // Optional; no length limit is set.
+  systemPrompt: z.string().optional(), // Optional; no length limit is set.
+});
+
+export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>; // Static type derived from the schema above.
+
+export const EXTRACTION_RESULT_JSON_SCHEMA: Record<string, unknown> = { // JSON Schema describing the same shape as ExtractionResultSchema. NOTE(review): it is a separate hand-written definition, so the two must be kept in sync manually.
+  type: "object",
+  properties: {
+    nodes: {
+      type: "array",
+      items: {
+        type: "object",
+        properties: {
+          nodeType: { type: "string" },
+          content: { type: "string" },
+          supportSnippet: { type: "string" },
+          sourceSection: { type: ["string", "null"] }, // String or null, matching z.string().nullable().
+        },
+        required: ["nodeType", "content", "supportSnippet", "sourceSection"], // All four keys are required, including the nullable one.
+        additionalProperties: false, // Extra keys on a node are disallowed.
+      },
+    },
+  },
+  required: ["nodes"],
+  additionalProperties: false, // Extra top-level keys are disallowed.
+};
+
+export type ExtractionResponse = // Discriminated union on `success`.
+  | { success: true; data: ExtractionResult } // Success: carries the extraction result.
+  | { success: false; error: string }; // Failure: carries an error message string.
diff --git a/apps/website/app/types/llm.ts b/apps/website/app/types/llm.ts
@@ -1,12 +1,16 @@
+export type ContentBlock = Record<string, unknown>; // Loosely typed content part: any string-keyed object; specific block shapes are not modelled by this type.
+
 export type Message = {
   role: string;
-  content: string;
+  content: string | ContentBlock[]; // Either plain text or a list of content blocks.
 };
 
 export type Settings = {
   model: string;
   maxTokens: number;
   temperature: number;
+  systemPrompt?: string;
+  outputSchema?: Record<string, unknown>;
   safetySettings?: Array<{
     category: string;
     threshold: string;

diff --git a/apps/website/app/utils/ai/parseExtractionResponse.ts b/apps/website/app/utils/ai/parseExtractionResponse.ts
@@ -0,0 +1,9 @@
+import {
+  ExtractionResultSchema,
+  type ExtractionResult,
+} from "~/types/extraction";
+
+export const parseExtractionResponse = (raw: string): ExtractionResult => { // Parses raw text as JSON and validates it against ExtractionResultSchema; throws on either failure.
+  const parsed: unknown = JSON.parse(raw); // Throws SyntaxError when raw is not valid JSON.
+  return ExtractionResultSchema.parse(parsed); // zod's .parse throws (a ZodError) when the value does not match the schema.
+};