Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

81 changes: 28 additions & 53 deletions make-pdf/src/render.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
*/

import { marked } from "marked";
import sanitizeHtml from "sanitize-html";
import { smartypants } from "./smartypants";
import { printCss, type PrintCssOptions } from "./print-css";

Expand Down Expand Up @@ -170,60 +171,34 @@ function decodeTypographicEntities(html: string): string {
/**
* Strip dangerous HTML from markdown-produced output.
*
* We can't use DOMPurify (server-side; adds a jsdom dep). A conservative
* regex sanitizer is fine for this use case because:
* 1. marked produces structured HTML (never malformed)
* 2. we only need to strip a fixed blacklist of elements + attrs
* 3. the output goes through Chromium's parser again, which normalizes
*
* What's stripped:
* - <script>, <iframe>, <object>, <embed>, <link>, <meta>, <base>, <form>
* (and their content).
* - on* event handler attributes (onclick, ONCLICK, etc.).
* - href/src with javascript: scheme.
* - <svg> tags with <script> inside them.
* Use a parser-backed sanitizer instead of regex matching. Regex-based HTML
* filtering is brittle and can be bypassed by malformed-but-browser-accepted
* markup.
*/
export function sanitizeUntrustedHtml(html: string): string {
let s = html;

// Elements to remove entirely (including content).
const DANGER_TAGS = [
"script", "iframe", "object", "embed", "link", "meta", "base", "form",
"applet", "frame", "frameset",
];
for (const tag of DANGER_TAGS) {
const re = new RegExp(`<${tag}\\b[\\s\\S]*?</${tag}>`, "gi");
s = s.replace(re, "");
// Self-closing / unclosed variants
const selfRe = new RegExp(`<${tag}\\b[^>]*/?>`, "gi");
s = s.replace(selfRe, "");
}

// SVG <script>
s = s.replace(/<svg([^>]*)>([\s\S]*?)<\/svg>/gi, (_, attrs, body) => {
return `<svg${attrs}>${body.replace(/<script\b[\s\S]*?<\/script>/gi, "")}</svg>`;
});

// Event handler attributes (on* in any case).
s = s.replace(/\s+on[a-zA-Z]+\s*=\s*"[^"]*"/gi, "");
s = s.replace(/\s+on[a-zA-Z]+\s*=\s*'[^']*'/gi, "");
s = s.replace(/\s+on[a-zA-Z]+\s*=\s*[^\s>]+/gi, "");
/**
 * Allowlist configuration passed to sanitize-html by sanitizeUntrustedHtml.
 *
 * Tags outside `allowedTags` are dropped (`disallowedTagsMode: "discard"`),
 * attributes outside `allowedAttributes` are removed, and URL-bearing
 * attributes are kept only when their scheme is allowlisted.
 */
const SANITIZE_OPTIONS: sanitizeHtml.IOptions = {
  // Keep common markdown output tags only; drop active/embedding content.
  allowedTags: [
    ...sanitizeHtml.defaults.allowedTags,
    "img",
    // Strikethrough markup. NOTE(review): presumably what marked emits for
    // GFM `~~text~~` — confirm against the renderer's actual output.
    "del",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "section", "figure", "figcaption",
    "table", "thead", "tbody", "tr", "th", "td",
  ],
  allowedAttributes: {
    a: ["href", "name", "target", "rel", "title"],
    img: ["src", "alt", "title", "width", "height"],
    "*": ["id", "class", "lang", "dir", "align"],
  },
  // `data:` is intentionally absent from the global list: allowing it here
  // would let `<a href="data:text/html,...">` through. Inline data URIs are
  // only needed for images, which are handled by allowedSchemesByTag below.
  allowedSchemes: ["http", "https", "mailto", "tel"],
  allowedSchemesByTag: {
    img: ["http", "https", "data"],
  },
  // Reject `//host/path` URLs so every kept URL names an explicit scheme.
  allowProtocolRelative: false,
  disallowedTagsMode: "discard",
};

// javascript: URLs in href/src/action/formaction
s = s.replace(
/(\s(?:href|src|action|formaction|xlink:href)\s*=\s*)(?:"javascript:[^"]*"|'javascript:[^']*'|javascript:[^\s>]+)/gi,
'$1"#"',
);

// srcdoc attribute (iframe escape hatch — already stripped via iframe above,
// but defense-in-depth).
s = s.replace(/\s+srcdoc\s*=\s*"[^"]*"/gi, "");
s = s.replace(/\s+srcdoc\s*=\s*'[^']*'/gi, "");

// style="url(javascript:..)" — strip javascript: inside style attrs.
s = s.replace(/url\(\s*javascript:[^)]*\)/gi, "url(#)");

return s;
export function sanitizeUntrustedHtml(html: string): string {
  // Delegate to the parser-backed sanitizer using the module-level allowlist.
  const cleaned = sanitizeHtml(html, SANITIZE_OPTIONS);
  return cleaned;
}

// ─── Cover / TOC / Chapter helpers ────────────────────────────────────
Expand Down Expand Up @@ -353,7 +328,7 @@ function decodeTextEntities(s: string): string {
}

/**
 * Reduce an HTML fragment to text by removing every tag and attribute.
 *
 * Uses the parser-backed sanitizer with an empty allowlist instead of a
 * `<[^>]+>` regex, which mis-handles `>` inside attribute values and bare
 * `<` / `>` characters in text.
 *
 * NOTE(review): sanitize-html is expected to return entity-escaped text;
 * confirm callers that escape or decode the result still behave as intended.
 *
 * @param html HTML fragment to flatten.
 * @returns The fragment with all markup removed.
 */
function stripTags(html: string): string {
  return sanitizeHtml(html, { allowedTags: [], allowedAttributes: {} });
}

function escapeHtml(s: string): string {
Expand Down
10 changes: 6 additions & 4 deletions make-pdf/test/render.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,18 +101,20 @@ describe("sanitizeUntrustedHtml", () => {
expect(sanitizeUntrustedHtml(input2)).not.toContain("ONCLICK");
});

test("rewrites javascript: URLs in href to #", () => {
test("removes javascript: URLs in href", () => {
const input = `<a href="javascript:alert(1)">bad</a>`;
const out = sanitizeUntrustedHtml(input);
expect(out).not.toContain("javascript:");
expect(out).toContain('href="#"');
expect(out).toContain("<a");
expect(out).not.toContain("href=");
});

test("strips inline SVG <script>", () => {
test("drops inline SVG content", () => {
const input = `<svg><script>alert(1)</script><circle r="5"/></svg>`;
const out = sanitizeUntrustedHtml(input);
expect(out).not.toContain("<script");
expect(out).toContain("<circle");
expect(out).not.toContain("<svg");
expect(out).not.toContain("<circle");
});

test("strips <object>, <embed>, <link>, <meta>, <base>, <form>", () => {
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"marked": "^18.0.2",
"playwright": "^1.58.2",
"puppeteer-core": "^24.40.0",
"sanitize-html": "^2.17.3",
"socks": "^2.8.8"
},
"engines": {
Expand Down