kepano · Jkker · Mar 20, 2026
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ _Note: for `defuddle/node` to import properly, the module format in your `packag
 
 ### CLI
 
-Defuddle includes a command-line interface for parsing web pages directly from the terminal. You can run it with `npx` or [install it globally](#cli-installation).
+Defuddle includes a command-line interface for parsing web pages directly from the terminal. You can run it with `npx` or [install it globally](#cli-installation). The CLI accepts a file path, a URL, or HTML piped over stdin.
 
 ```bash
 # Parse a local HTML file
@@ -74,6 +74,12 @@ npx defuddle parse page.html
 # Parse a URL
 npx defuddle parse https://example.com/article
 
+# Parse HTML from stdin
+cat page.html | npx defuddle parse
+
+# Fetch a page with curl and convert the piped HTML to markdown
+curl -L https://stephango.com/saw | npx defuddle parse --markdown
+
 # Output as markdown
 npx defuddle parse page.html --markdown
 
@@ -102,6 +108,8 @@ npx defuddle parse page.html --debug
 | `--debug` | | Enable debug mode |
 | `--lang <code>` | `-l` | Preferred language (BCP 47, e.g. `en`, `fr`, `ja`) |
 
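+When no `<source>` argument is provided, `defuddle parse` reads HTML from stdin. You can also pass `-` explicitly to read from stdin. If stdin is an interactive terminal (nothing is piped in), the command exits with an error instead of waiting for input.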
+
 ## Installation
 
 ```bash

diff --git a/src/cli.ts b/src/cli.ts
@@ -8,7 +8,7 @@ import { parseLinkedomHTML } from './utils/linkedom-compat';
 import { countWords } from './utils';
 import { getInitialUA, fetchPage, extractRawMarkdown, cleanMarkdownContent, BOT_UA } from './fetch';
 
-interface ParseOptions {
+export interface ParseOptions {
 	output?: string;
 	markdown?: boolean;
 	md?: boolean;
@@ -18,6 +18,10 @@ interface ParseOptions {
 	lang?: string;
 }
 
+interface ParseResult {
+	output: string;
+}
+
 // ANSI color helpers (avoids chalk dependency which is ESM-only)
 const useColor = process.stdout.isTTY ?? false;
 const ansi = {
@@ -28,134 +32,165 @@ const ansi = {
 // Read version from package.json
 const version = require('../package.json').version;
 
-const program = new Command();
-
-program
-	.name('defuddle')
-	.description('Extract article content from web pages')
-	.version(version);
-
-program
-	.command('parse')
-	.description('Parse HTML content from a file or URL')
-	.argument('<source>', 'HTML file path or URL to parse')
-	.option('-o, --output <file>', 'Output file path (default: stdout)')
-	.option('-m, --markdown', 'Convert content to markdown format')
-	.option('--md', 'Alias for --markdown')
-	.option('-j, --json', 'Output as JSON with metadata and content')
-	.option('-p, --property <name>', 'Extract a specific property (e.g., title, description, domain)')
-	.option('--debug', 'Enable debug mode')
-	.option('-l, --lang <code>', 'Preferred language (BCP 47, e.g. en, fr, ja)')
-	.action(async (source: string, options: ParseOptions) => {
-		try {
-			// Handle --md alias
-			if (options.md) {
-				options.markdown = true;
-			}
+export async function readStdin(input: NodeJS.ReadStream = process.stdin): Promise<string> {
+	return new Promise((resolve, reject) => {
+		const chunks: string[] = [];
+		input.setEncoding('utf8');
+		input.on('data', (chunk: string) => {
+			chunks.push(chunk);
+		});
+		input.on('end', () => resolve(chunks.join('')));
+		input.on('error', reject);
+	});
+}
 
-			const defuddleOpts = {
-				debug: options.debug,
-				markdown: options.markdown,
-				separateMarkdown: options.markdown || options.json,
-				language: options.lang,
-			};
-
-			let html: string;
-			let url: string | undefined;
-
-			// Determine if source is a URL or file path
-			const isUrl = source.startsWith('http://') || source.startsWith('https://');
-			if (isUrl) {
-				url = source;
-				const initialUA = getInitialUA(source);
-				html = await fetchPage(source, initialUA, options.lang);
-			} else {
-				const filePath = resolve(process.cwd(), source);
-				html = await readFile(filePath, 'utf-8');
-			}
+export async function parseSource(source: string | undefined, options: ParseOptions, input: NodeJS.ReadStream = process.stdin): Promise<ParseResult> {
+	// Handle --md alias
+	if (options.md) {
+		options.markdown = true;
+	}
 
-			const doc = parseLinkedomHTML(html);
-			let result = await Defuddle(doc, url, defuddleOpts);
-
-			// If no content was extracted from a URL, retry with bot UA.
-			// Some sites (e.g. Obsidian Publish) serve pre-rendered content to bots.
-			if (isUrl && result.wordCount === 0) {
-				try {
-					const botHtml = await fetchPage(source, BOT_UA, options.lang);
-
-					// Check for raw markdown before DOM parsing destroys whitespace
-					const rawMarkdown = extractRawMarkdown(botHtml);
-					if (rawMarkdown) {
-						const botDoc = parseLinkedomHTML(botHtml);
-						const botResult = await Defuddle(botDoc, url, defuddleOpts);
-						botResult.content = cleanMarkdownContent(rawMarkdown);
-						botResult.wordCount = countWords(botResult.content);
-						result = botResult;
-					} else {
-						const botDoc = parseLinkedomHTML(botHtml);
-						const botResult = await Defuddle(botDoc, url, defuddleOpts);
-						if (botResult.wordCount > 0) {
-							result = botResult;
-						}
-					}
-				} catch {
-					// Bot UA may be blocked — use original result
-				}
-			}
+	const defuddleOpts = {
+		debug: options.debug,
+		markdown: options.markdown,
+		separateMarkdown: options.markdown || options.json,
+		language: options.lang,
+	};
 
-			// Check if parsing produced meaningful content
-			const textContent = result.content.replace(/<[^>]*>/g, '').trim();
-			if (!textContent) {
-				console.error(ansi.red(`Error: No content could be extracted from ${source}`));
-				process.exit(1);
-			}
+	let html: string;
+	let url: string | undefined;
 
-			// Format output
-			let output: string;
+	const usesStdin = !source || source === '-';
+	const isUrl = !usesStdin && (source.startsWith('http://') || source.startsWith('https://'));
 
-			if (options.property) {
-				const property = options.property;
-				if (property in result) {
-					output = result[property as keyof typeof result]?.toString() || '';
-				} else {
-					console.error(ansi.red(`Error: Property "${property}" not found in response`));
-					process.exit(1);
-				}
-			} else if (options.json) {
-				output = JSON.stringify({
-					content: result.content,
-					title: result.title,
-					description: result.description,
-					domain: result.domain,
-					favicon: result.favicon,
-					image: result.image,
-					language: result.language,
-					metaTags: result.metaTags,
-					parseTime: result.parseTime,
-					published: result.published,
-					author: result.author,
-					site: result.site,
-					schemaOrgData: result.schemaOrgData,
-					wordCount: result.wordCount,
-					...(result.contentMarkdown ? { contentMarkdown: result.contentMarkdown } : {}),
-					...(result.variables ? { variables: result.variables } : {}),
-				}, null, 2);
+	if (usesStdin) {
+		if (input.isTTY) {
+			throw new Error('No input source provided. Pass a file path or URL, or pipe HTML to stdin.');
+		}
+		html = await readStdin(input);
+	} else if (isUrl) {
+		url = source;
+		const initialUA = getInitialUA(source);
+		html = await fetchPage(source, initialUA, options.lang);
+	} else {
+		const filePath = resolve(process.cwd(), source);
+		html = await readFile(filePath, 'utf-8');
+	}
+
+	const doc = parseLinkedomHTML(html);
+	let result = await Defuddle(doc, url, defuddleOpts);
+
+	// If no content was extracted from a URL, retry with bot UA.
+	// Some sites (e.g. Obsidian Publish) serve pre-rendered content to bots.
+	if (isUrl && result.wordCount === 0) {
+		try {
+			const botHtml = await fetchPage(source, BOT_UA, options.lang);
+
+			// Check for raw markdown before DOM parsing destroys whitespace
+			const rawMarkdown = extractRawMarkdown(botHtml);
+			if (rawMarkdown) {
+				const botDoc = parseLinkedomHTML(botHtml);
+				const botResult = await Defuddle(botDoc, url, defuddleOpts);
+				botResult.content = cleanMarkdownContent(rawMarkdown);
+				botResult.wordCount = countWords(botResult.content);
+				result = botResult;
 			} else {
-				output = result.content;
+				const botDoc = parseLinkedomHTML(botHtml);
+				const botResult = await Defuddle(botDoc, url, defuddleOpts);
+				if (botResult.wordCount > 0) {
+					result = botResult;
+				}
 			}
+		} catch {
+			// Bot UA may be blocked — use original result
+		}
+	}
+
+	// Check if parsing produced meaningful content
+	const textContent = parseLinkedomHTML(`<!DOCTYPE html><html><body>${result.content}</body></html>`)
+		.body.textContent?.trim() || '';
+	if (!textContent) {
+		throw new Error(`No content could be extracted from ${usesStdin ? 'stdin' : source}`);
+	}
+
+	// Format output
+	let output: string;
+
+	if (options.property) {
+		const property = options.property;
+		if (property in result) {
+			output = result[property as keyof typeof result]?.toString() || '';
+		} else {
+			throw new Error(`Property "${property}" not found in response`);
+		}
+	} else if (options.json) {
+		output = JSON.stringify({
+			content: result.content,
+			title: result.title,
+			description: result.description,
+			domain: result.domain,
+			favicon: result.favicon,
+			image: result.image,
+			language: result.language,
+			metaTags: result.metaTags,
+			parseTime: result.parseTime,
+			published: result.published,
+			author: result.author,
+			site: result.site,
+			schemaOrgData: result.schemaOrgData,
+			wordCount: result.wordCount,
+			...(result.contentMarkdown ? { contentMarkdown: result.contentMarkdown } : {}),
+			...(result.variables ? { variables: result.variables } : {}),
+		}, null, 2);
+	} else {
+		output = result.content;
+	}
+
+	return { output };
+}
 
-			// Handle output
-			if (options.output) {
-				const outputPath = resolve(process.cwd(), options.output);
-				await writeFile(outputPath, output, 'utf-8');
-				console.log(ansi.green(`Output written to ${options.output}`));
-			} else {
-				console.log(output);
+export function createProgram(): Command {
+	const program = new Command();
+
+	program
+		.name('defuddle')
+		.description('Extract article content from web pages')
+		.version(version);
+
+	program
+		.command('parse')
+		.description('Parse HTML content from a file, URL, or stdin')
+		.argument('[source]', 'HTML file path, URL, or "-" to read from stdin')
+		.option('-o, --output <file>', 'Output file path (default: stdout)')
+		.option('-m, --markdown', 'Convert content to markdown format')
+		.option('--md', 'Alias for --markdown')
+		.option('-j, --json', 'Output as JSON with metadata and content')
+		.option('-p, --property <name>', 'Extract a specific property (e.g., title, description, domain)')
+		.option('--debug', 'Enable debug mode')
+		.option('-l, --lang <code>', 'Preferred language (BCP 47, e.g. en, fr, ja)')
+		.action(async (source: string | undefined, options: ParseOptions) => {
+			try {
+				const { output } = await parseSource(source, options);
+
+				// Handle output
+				if (options.output) {
+					const outputPath = resolve(process.cwd(), options.output);
+					await writeFile(outputPath, output, 'utf-8');
+					console.log(ansi.green(`Output written to ${options.output}`));
+				} else {
+					console.log(output);
+				}
+			} catch (error) {
+				console.error(ansi.red('Error:'), error instanceof Error ? error.message : 'Unknown error occurred');
+				process.exit(1);
 			}
-		} catch (error) {
-			console.error(ansi.red('Error:'), error instanceof Error ? error.message : 'Unknown error occurred');
-			process.exit(1);
-		}
-	});
+		});
 
-program.parse();
+	return program;
+}
+
+const program = createProgram();
+
+if (require.main === module) {
+	program.parse();
+}