From 378594cd597b44750c4d781e48bcd97ed859f7f3 Mon Sep 17 00:00:00 2001
From: SakshiKekre <sakshi.kekre@gmail.com>
Date: Wed, 1 Apr 2026 22:12:48 -0700
Subject: [PATCH] Add llms.txt route handlers to Next.js website

Port llms.txt generation from Vite build script (app/scripts/generate-llms-txt.ts)
to Next.js route handlers. Restores /llms.txt, /llms-full.txt, /llms-recent.txt,
/llms-research-us.txt, /llms-research-uk.txt which have been 404ing since the
website domain cutover to Next.js.

Core transformation logic (chart summaries, iframe placeholders, notebook extraction)
copied from the Vite script with minimal changes to use existing website modules
(getPostsSorted, getArticleContent, authors.json import) instead of standalone
fs/path scaffolding.
---
 website/src/app/llms-full.txt/route.ts        |   7 +
 website/src/app/llms-recent.txt/route.ts      |   7 +
 website/src/app/llms-research-uk.txt/route.ts |   7 +
 website/src/app/llms-research-us.txt/route.ts |   7 +
 website/src/app/llms.txt/route.ts             |   7 +
 website/src/lib/llmsTxt.ts                    | 247 ++++++++++++++++++
 6 files changed, 282 insertions(+)
 create mode 100644 website/src/app/llms-full.txt/route.ts
 create mode 100644 website/src/app/llms-recent.txt/route.ts
 create mode 100644 website/src/app/llms-research-uk.txt/route.ts
 create mode 100644 website/src/app/llms-research-us.txt/route.ts
 create mode 100644 website/src/app/llms.txt/route.ts
 create mode 100644 website/src/lib/llmsTxt.ts

diff --git a/website/src/app/llms-full.txt/route.ts b/website/src/app/llms-full.txt/route.ts
new file mode 100644
index 000000000..62b45e24b
--- /dev/null
+++ b/website/src/app/llms-full.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+/** Serves the complete archive of research articles as plain text. */
+export function GET(): Response {
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile(), { headers });
+}
diff --git a/website/src/app/llms-recent.txt/route.ts b/website/src/app/llms-recent.txt/route.ts
new file mode 100644
index 000000000..09124ef3f
--- /dev/null
+++ b/website/src/app/llms-recent.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+/** Serves the full text of the 50 most recent research articles. */
+export function GET(): Response {
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile(undefined, 50), { headers });
+}
diff --git a/website/src/app/llms-research-uk.txt/route.ts b/website/src/app/llms-research-uk.txt/route.ts
new file mode 100644
index 000000000..a65cd6674
--- /dev/null
+++ b/website/src/app/llms-research-uk.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+/** Serves the full text of every research article tagged "uk". */
+export function GET(): Response {
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile("uk"), { headers });
+}
diff --git a/website/src/app/llms-research-us.txt/route.ts b/website/src/app/llms-research-us.txt/route.ts
new file mode 100644
index 000000000..b19b79255
--- /dev/null
+++ b/website/src/app/llms-research-us.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+/** Serves the full text of every research article tagged "us". */
+export function GET(): Response {
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile("us"), { headers });
+}
diff --git a/website/src/app/llms.txt/route.ts b/website/src/app/llms.txt/route.ts
new file mode 100644
index 000000000..66ac0bb87
--- /dev/null
+++ b/website/src/app/llms.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateIndex } from "@/lib/llmsTxt";
+
+/** Serves the llms.txt index that links to the full-text files. */
+export function GET(): Response {
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateIndex(), { headers });
+}
diff --git a/website/src/lib/llmsTxt.ts b/website/src/lib/llmsTxt.ts
new file mode 100644
index 000000000..07ed2e818
--- /dev/null
+++ b/website/src/lib/llmsTxt.ts
@@ -0,0 +1,247 @@
+/**
+ * Shared logic for generating llms.txt content.
+ *
+ * Ported from app/scripts/generate-llms-txt.ts with minimal changes:
+ * - Replaced standalone fs/path scaffolding with existing website modules
+ * - Core transformation logic (charts, iframes, notebooks) is unchanged
+ */
+
+import { getPostsSorted, type BlogPost } from "@/data/posts/postTransformers";
+import { getArticleContent } from "@/lib/articles";
+import authorsData from "@/data/posts/authors.json";
+import type { AuthorsCollection } from "@/types/blog";
+
+const authors = authorsData as AuthorsCollection; // keyed by author id
+
+/**
+ * Summarise a Plotly figure as a short text placeholder.
+ *
+ * @param plotlyJson - Figure JSON; read only when no caption is given.
+ * @param caption - Figure caption; when non-empty it is used as the summary.
+ * @returns `[Chart: ...]` built from the caption, else from the axis titles
+ *   and first trace type, else a generic pointer to the original article.
+ */
+function extractChartSummary(plotlyJson: string, caption?: string): string {
+  if (caption) {
+    return `[Chart: ${caption}]`;
+  }
+
+  // Axis titles appear as `{ text }` objects or, in older figures, strings.
+  const titleText = (title: unknown): string | undefined => {
+    if (typeof title === "string") return title || undefined;
+    const text = (title as { text?: unknown } | null | undefined)?.text;
+    return typeof text === "string" && text ? text : undefined;
+  };
+  try {
+    const chart = JSON.parse(plotlyJson);
+    const xTitle = titleText(chart.layout?.xaxis?.title);
+    const yTitle = titleText(chart.layout?.yaxis?.title);
+    const traceType = chart.data?.[0]?.type;
+    const parts = [
+      xTitle && `x-axis: ${xTitle}`,
+      yTitle && `y-axis: ${yTitle}`,
+      traceType && `type: ${traceType}`,
+    ].filter(Boolean);
+    if (parts.length > 0) return `[Chart: ${parts.join(", ")}]`;
+  } catch {
+    // Unparseable input falls through to the generic placeholder.
+  }
+  return "[Chart: see original article]";
+}
+
+/**
+ * Rewrite article markdown so it reads well as plain text for LLMs.
+ *
+ * - Plotly code fences become one-line `[Chart: ...]` summaries; a bold
+ *   `**Figure N: caption**` line directly above a fence supplies the text.
+ * - Empty `<iframe>` elements with a `src` become `[Interactive: ...]`,
+ *   labelled by `title` when present, else by the last URL path segment.
+ * - All other content (tables, prose) passes through untouched.
+ */
+function transformArticleContent(content: string): string {
+  // Captioned figures first, so the bare-fence pass below cannot claim them.
+  const figurePattern =
+    /\*\*(?:Figure|Table)\s*\d*:?\s*([^*]+)\*\*\s*\n+```plotly\n([\s\S]*?)```/g;
+  let result = content.replace(figurePattern, (_, caption, json) => {
+    const summary = extractChartSummary(json.trim(), caption.trim());
+    return `**${caption.trim()}**\n\n${summary}`;
+  });
+
+  // Remaining plotly fences have no caption to borrow.
+  const plotlyPattern = /```plotly\n([\s\S]*?)```/g;
+  result = result.replace(plotlyPattern, (_, json) =>
+    extractChartSummary(json.trim()),
+  );
+
+  // Capture the whole attribute list and look attributes up one at a time:
+  // a single pattern with an optional title group never captures the title,
+  // because the greedy `[^>]*` before it consumes the attribute first.
+  const iframePattern = /<iframe\b([^>]*)><\/iframe>/g;
+  return result.replace(iframePattern, (tag: string, attrs: string) => {
+    const src = /\ssrc="([^"]*)"/.exec(attrs)?.[1];
+    if (src === undefined) return tag; // no src: leave the markup untouched
+    const title = /\stitle="([^"]*)"/.exec(attrs)?.[1];
+    if (title) return `[Interactive: ${title}]`;
+    // Fall back to a readable form of the last URL path segment.
+    const urlParts = src.split("/").filter(Boolean);
+    const lastPart = urlParts[urlParts.length - 1]?.replace(".html", "") || "";
+    const readable = lastPart.replace(/-/g, " ");
+    return `[Interactive: ${readable || "see original article"}]`;
+  });
+}
+
+/**
+ * Flatten a Jupyter notebook (raw `.ipynb` JSON) into markdown text.
+ *
+ * Markdown cells are emitted verbatim; code cells become fenced `python`
+ * blocks, each followed by one plain fence per output that carries `text`
+ * or `text/plain` data. Other cell kinds and outputs are dropped.
+ *
+ * @param raw - Notebook file contents.
+ * @returns Cells joined by blank lines, or `raw` itself if it cannot be read.
+ */
+function extractNotebookContent(raw: string): string {
+  // nbformat allows multi-line text as one string or a list of strings;
+  // `.join` on the string form would throw and discard the whole notebook.
+  type MultilineText = string | string[];
+  const toText = (text?: MultilineText): string =>
+    Array.isArray(text) ? text.join("") : (text ?? "");
+  const fence = (body: string, lang = "") => `\`\`\`${lang}\n${body}\n\`\`\``;
+  try {
+    const nb = JSON.parse(raw) as {
+      cells: Array<{
+        cell_type: string;
+        source?: MultilineText;
+        outputs?: Array<{
+          text?: MultilineText;
+          data?: Record<string, MultilineText>;
+        }>;
+      }>;
+    };
+    const parts: string[] = [];
+    for (const cell of nb.cells) {
+      const source = toText(cell.source);
+      if (cell.cell_type === "markdown") parts.push(source);
+      if (cell.cell_type !== "code") continue;
+      parts.push(fence(source, "python"));
+      for (const output of cell.outputs ?? []) {
+        const text = output.text || output.data?.["text/plain"];
+        if (text) parts.push(fence(toText(text)));
+      }
+    }
+    return parts.join("\n\n");
+  } catch {
+    return raw;
+  }
+}
+
+/**
+ * Render one post as a metadata header followed by its LLM-ready body.
+ * Notebooks are flattened first; unknown author ids are printed as-is.
+ */
+function formatArticle(post: BlogPost): string {
+  const bylines = post.authors.map((id) => authors[id]?.name || id);
+  const source = getArticleContent(post.filename);
+  const isNotebook = post.filename.endsWith(".ipynb");
+  const body = transformArticleContent(
+    isNotebook ? extractNotebookContent(source) : source,
+  );
+
+  return [
+    "---",
+    `# ${post.title}`,
+    `Slug: ${post.slug}`,
+    `Date: ${post.date}`,
+    `Authors: ${bylines.join(", ")}`,
+    `Tags: ${post.tags.join(", ")}`,
+    `Description: ${post.description}`,
+    "---",
+    "",
+    body,
+    "",
+  ].join("\n");
+}
+
+/**
+ * Build the `/llms.txt` index: overview, first ten sorted posts, file links.
+ */
+export function generateIndex(): string {
+  const all = getPostsSorted();
+  const countTagged = (tag: "us" | "uk") =>
+    all.filter((post) => post.tags.includes(tag)).length;
+  const toLink = (post: BlogPost) =>
+    `- [${post.title}](/research/${post.slug}): ${post.description}`;
+  return `# PolicyEngine Research
+
+> PolicyEngine is a free, open-source tool for analyzing tax and benefit policy impacts through microsimulation modeling. We provide household calculators and society-wide impact analysis for the US and UK.
+
+## About PolicyEngine
+
+PolicyEngine enables users to:
+- Calculate how policy changes affect individual households
+- Estimate society-wide impacts on revenue, poverty, and inequality
+- Compare reform proposals across different scenarios
+- Access programmatic policy analysis via API
+
+## Recent Research
+
+${all.slice(0, 10).map(toLink).join("\n")}
+
+## Research by Region
+
+- [US Research](/llms-research-us.txt): ${countTagged("us")} articles on US federal and state policy
+- [UK Research](/llms-research-uk.txt): ${countTagged("uk")} articles on UK tax and benefit policy
+
+## Recent Research (Full Text)
+
+- [Recent Articles](/llms-recent.txt): Last 50 articles with full text
+
+## Full Archive
+
+- [All Research](/llms-full.txt): Complete archive of all PolicyEngine research articles
+
+## Documentation
+
+- [API Documentation](https://policyengine.org/us/api): Programmatic access to PolicyEngine
+- [Python Package](https://policyengine.github.io/policyengine-us/): policyengine-us documentation
+
+## Contact
+
+- Website: https://policyengine.org
+- GitHub: https://github.com/PolicyEngine
+- Email: hello@policyengine.org
+`;
+}
+
+/**
+ * Concatenate full-text articles for a region, a recent subset, or all.
+ * A non-zero `limit` keeps the first `limit` posts and takes the "recent"
+ * header even when `region` is set. Failing articles are logged and skipped.
+ */
+export function generateArticleFile(
+  region?: "us" | "uk",
+  limit?: number,
+): string {
+  const posts = getPostsSorted();
+  let filtered = region ? posts.filter((p) => p.tags.includes(region)) : posts;
+
+  let header: string;
+  if (limit) {
+    filtered = filtered.slice(0, limit);
+    header = `# PolicyEngine Recent Research\n\n> The ${filtered.length} most recent PolicyEngine research articles.\n\n`;
+  } else if (region) {
+    const regionLabel = region === "us" ? "US federal and state" : "UK";
+    header = `# PolicyEngine ${region.toUpperCase()} Research\n\n> ${regionLabel} tax and benefit policy analysis.\n\n`;
+  } else {
+    header = `# PolicyEngine Research Archive\n\n> Complete archive of PolicyEngine research articles.\n\n`;
+  }
+  const articles: string[] = [];
+  for (const post of filtered) {
+    try {
+      articles.push(formatArticle(post));
+    } catch (error) {
+      console.warn(`llms.txt: skipping article "${post.slug}"`, error);
+    }
+  }
+  return header + articles.join("\n\n---\n\n");
+}