diff --git a/README.md b/README.md index 51dd99936..576c4163a 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ _Note: for `defuddle/node` to import properly, the module format in your `packag ### CLI -Defuddle includes a command-line interface for parsing web pages directly from the terminal. You can run it with `npx` or [install it globally](#cli-installation). +Defuddle includes a command-line interface for parsing web pages directly from the terminal. You can run it with `npx` or [install it globally](#cli-installation). The CLI accepts a file path, a URL, or HTML piped over stdin. ```bash # Parse a local HTML file @@ -74,6 +74,12 @@ npx defuddle parse page.html # Parse a URL npx defuddle parse https://example.com/article +# Parse HTML from stdin +cat page.html | npx defuddle parse + +# Parse fetched HTML from stdin as markdown +curl -L https://stephango.com/saw | npx defuddle parse --markdown + # Output as markdown npx defuddle parse page.html --markdown @@ -102,6 +108,8 @@ npx defuddle parse page.html --debug | `--debug` | | Enable debug mode | | `--lang ` | `-l` | Preferred language (BCP 47, e.g. `en`, `fr`, `ja`) | +When no `[source]` argument is provided, `defuddle parse` reads HTML from stdin. You can also pass `-` explicitly to force stdin input. + ## Installation ```bash diff --git a/src/cli.ts b/src/cli.ts index 218d0136d..85d68135c 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -8,7 +8,7 @@ import { parseLinkedomHTML } from './utils/linkedom-compat'; import { countWords } from './utils'; import { getInitialUA, fetchPage, extractRawMarkdown, cleanMarkdownContent, BOT_UA } from './fetch'; -interface ParseOptions { +export interface ParseOptions { output?: string; markdown?: boolean; md?: boolean; @@ -18,6 +18,10 @@ interface ParseOptions { lang?: string; } +interface ParseResult { + output: string; +} + // ANSI color helpers (avoids chalk dependency which is ESM-only) const useColor = process.stdout.isTTY ?? 
/**
 * Drains a readable stream and returns everything it produced as one string.
 *
 * @param input - Stream to read; defaults to `process.stdin`.
 * @returns The concatenation of every chunk, decoded as UTF-8.
 * @throws Rejects with the stream's error if the stream fails while being read.
 */
export async function readStdin(input: NodeJS.ReadStream = process.stdin): Promise<string> {
	// Decode inside the stream so a multi-byte character split across two
	// chunks is reassembled before it reaches us.
	input.setEncoding('utf8');

	let text = '';
	for await (const chunk of input) {
		// After setEncoding the chunks are strings; String() keeps this safe
		// for streams that still hand over Buffers.
		text += typeof chunk === 'string' ? chunk : String(chunk);
	}
	return text;
}
/** Shape of a completed Defuddle parse, derived from the call itself so the helpers stay in sync. */
type DefuddleResult = Awaited<ReturnType<typeof Defuddle>>;

/**
 * Loads the HTML to parse from stdin, a URL, or a local file.
 *
 * @param source - File path, `http(s)://` URL, `-`, or `undefined` (the last two select stdin).
 * @param lang - Preferred language forwarded to `fetchPage` for URL sources.
 * @param input - Stream read when stdin is selected.
 * @returns The HTML text, plus `url` when the source was a URL.
 * @throws Error when stdin is selected but `input` is a TTY.
 */
async function loadHtml(
	source: string | undefined,
	lang: string | undefined,
	input: NodeJS.ReadStream
): Promise<{ html: string; url?: string }> {
	if (!source || source === '-') {
		// Nothing is being piped when stdin is a terminal, so fail with guidance.
		if (input.isTTY) {
			throw new Error('No input source provided. Pass a file path or URL, or pipe HTML to stdin.');
		}
		return { html: await readStdin(input) };
	}

	if (source.startsWith('http://') || source.startsWith('https://')) {
		const initialUA = getInitialUA(source);
		return { html: await fetchPage(source, initialUA, lang), url: source };
	}

	const filePath = resolve(process.cwd(), source);
	return { html: await readFile(filePath, 'utf-8') };
}

/**
 * Turns a parse result into the text the CLI prints or writes.
 *
 * @param result - Parse result to format.
 * @param options - `property` selects a single field, `json` selects the JSON document;
 *   otherwise the extracted content is returned as-is.
 * @returns The formatted output string.
 * @throws Error when `options.property` names a field the result does not have.
 */
function formatOutput(result: DefuddleResult, options: ParseOptions): string {
	const { property } = options;

	if (property) {
		// `in` also matches inherited members, so reject names that come from
		// Object.prototype (e.g. "constructor", "toString") instead of printing them.
		if (!(property in result) || property in Object.prototype) {
			throw new Error(`Property "${property}" not found in response`);
		}
		const value: unknown = result[property as keyof DefuddleResult];
		if (value === undefined || value === null) {
			return '';
		}
		// Objects and arrays would stringify to "[object Object]"; emit JSON instead.
		return typeof value === 'object' ? JSON.stringify(value, null, 2) : String(value);
	}

	if (options.json) {
		return JSON.stringify({
			content: result.content,
			title: result.title,
			description: result.description,
			domain: result.domain,
			favicon: result.favicon,
			image: result.image,
			language: result.language,
			metaTags: result.metaTags,
			parseTime: result.parseTime,
			published: result.published,
			author: result.author,
			site: result.site,
			schemaOrgData: result.schemaOrgData,
			wordCount: result.wordCount,
			...(result.contentMarkdown ? { contentMarkdown: result.contentMarkdown } : {}),
			...(result.variables ? { variables: result.variables } : {}),
		}, null, 2);
	}

	return result.content;
}

/**
 * Parses HTML from a file, a URL, or stdin and formats the result for the CLI.
 *
 * @param source - File path, `http(s)://` URL, or `-`/`undefined` to read from `input`.
 * @param options - CLI flags; `md` is treated as an alias for `markdown`. The object is not modified.
 * @param input - Stream used for stdin input; defaults to `process.stdin`.
 * @returns An object whose `output` is the text to print or write.
 * @throws Error when no input is available, no content could be extracted, or
 *   `options.property` is not present on the result. Errors from reading the
 *   file, fetching the URL, or parsing are propagated.
 */
export async function parseSource(
	source: string | undefined,
	options: ParseOptions,
	input: NodeJS.ReadStream = process.stdin
): Promise<ParseResult> {
	// Handle the --md alias locally rather than writing back into the caller's options.
	const markdown = options.md ? true : options.markdown;

	const defuddleOpts = {
		debug: options.debug,
		markdown,
		separateMarkdown: markdown || options.json,
		language: options.lang,
	};

	const { html, url } = await loadHtml(source, options.lang, input);

	const doc = parseLinkedomHTML(html);
	let result = await Defuddle(doc, url, defuddleOpts);

	// If no content was extracted from a URL, retry with bot UA.
	// Some sites (e.g. Obsidian Publish) serve pre-rendered content to bots.
	if (url !== undefined && result.wordCount === 0) {
		try {
			const botHtml = await fetchPage(url, BOT_UA, options.lang);

			// Check for raw markdown before DOM parsing destroys whitespace
			const rawMarkdown = extractRawMarkdown(botHtml);
			const botResult = await Defuddle(parseLinkedomHTML(botHtml), url, defuddleOpts);
			if (rawMarkdown) {
				botResult.content = cleanMarkdownContent(rawMarkdown);
				botResult.wordCount = countWords(botResult.content);
				result = botResult;
			} else if (botResult.wordCount > 0) {
				result = botResult;
			}
		} catch {
			// Bot UA may be blocked — use original result
		}
	}

	// Check if parsing produced meaningful content. The content is wrapped in a
	// full document so a <body> exists to read the text from.
	const wrapped = parseLinkedomHTML(`<!DOCTYPE html><html><body>${result.content}</body></html>`);
	const textContent = wrapped.body?.textContent?.trim() ?? '';
	if (!textContent) {
		const label = !source || source === '-' ? 'stdin' : source;
		throw new Error(`No content could be extracted from ${label}`);
	}

	return { output: formatOutput(result, options) };
}
/**
 * Builds the `defuddle` command-line program with its `parse` subcommand.
 *
 * The `parse` action runs `parseSource`, then writes the output to the
 * `--output` file or prints it to stdout. Any error is printed to stderr and
 * the process exits with status 1.
 *
 * @returns The configured Commander program; the caller decides when to parse argv.
 */
export function createProgram(): Command {
	const program = new Command();

	program
		.name('defuddle')
		.description('Extract article content from web pages')
		.version(version);

	program
		.command('parse')
		.description('Parse HTML content from a file, URL, or stdin')
		.argument('[source]', 'HTML file path, URL, or "-" to read from stdin')
		// Value-taking flags need a <placeholder>; without one Commander treats
		// the option as a boolean switch.
		.option('-o, --output <file>', 'Output file path (default: stdout)')
		.option('-m, --markdown', 'Convert content to markdown format')
		.option('--md', 'Alias for --markdown')
		.option('-j, --json', 'Output as JSON with metadata and content')
		.option('-p, --property <name>', 'Extract a specific property (e.g., title, description, domain)')
		.option('--debug', 'Enable debug mode')
		.option('-l, --lang <code>', 'Preferred language (BCP 47, e.g. en, fr, ja)')
		.action(async (source: string | undefined, options: ParseOptions) => {
			try {
				const { output } = await parseSource(source, options);

				// Handle output
				if (options.output) {
					const outputPath = resolve(process.cwd(), options.output);
					await writeFile(outputPath, output, 'utf-8');
					console.log(ansi.green(`Output written to ${options.output}`));
				} else {
					console.log(output);
				}
			} catch (error) {
				console.error(ansi.red('Error:'), error instanceof Error ? error.message : 'Unknown error occurred');
				process.exit(1);
			}
		});

	return program;
}
import { describe, expect, test } from 'vitest';
import { readFileSync, rmSync, writeFileSync, mkdtempSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
import { Readable } from 'stream';
import { Defuddle } from '../src/node';
import { parseSource } from '../src/cli';
import { parseDocument } from './helpers';

const fixturePath = join(__dirname, 'fixtures', 'general--appendix-heading.html');
const fixtureHtml = readFileSync(fixturePath, 'utf-8');

/** Builds a stand-in for `process.stdin` that emits `html` once and reports the given TTY state. */
function createMockStdin(html: string, isTTY = false): NodeJS.ReadStream {
	const stdin = Readable.from([html], { encoding: 'utf8' }) as NodeJS.ReadStream;
	(stdin as NodeJS.ReadStream & { isTTY?: boolean }).isTTY = isTTY;
	return stdin;
}

/** Runs Defuddle directly on `html` to obtain the content the CLI is expected to print. */
async function getExpectedContent(html: string): Promise<string> {
	const doc = parseDocument(html);
	const result = await Defuddle(doc);
	return result.content;
}

/** Replaces tags with spaces and collapses whitespace so outputs can be compared by text only. */
function stripHtmlAndNormalizeWhitespace(html: string): string {
	return html
		.replace(/<[^>]*>/g, ' ')
		.replace(/\s+/g, ' ')
		.trim();
}

describe('CLI parseSource', () => {
	test('reads HTML from stdin when no source is provided', async () => {
		const expected = await getExpectedContent(fixtureHtml);

		const result = await parseSource(undefined, {}, createMockStdin(fixtureHtml));

		expect(stripHtmlAndNormalizeWhitespace(result.output)).toEqual(stripHtmlAndNormalizeWhitespace(expected));
	});

	test('reads HTML from stdin when source is "-"', async () => {
		const result = await parseSource('-', { json: true }, createMockStdin(fixtureHtml));
		const parsed = JSON.parse(result.output);

		expect(parsed.title).toBe('Article with Appendix');
		expect(parsed.content).toContain('Appendix I');
	});

	test('continues to read local HTML files', async () => {
		const tempDir = mkdtempSync(join(tmpdir(), 'defuddle-cli-'));
		const filePath = join(tempDir, 'page.html');
		try {
			writeFileSync(filePath, fixtureHtml, 'utf-8');

			const expected = await getExpectedContent(fixtureHtml);
			const result = await parseSource(filePath, {});

			expect(stripHtmlAndNormalizeWhitespace(result.output)).toEqual(stripHtmlAndNormalizeWhitespace(expected));
		} finally {
			rmSync(tempDir, { recursive: true, force: true });
		}
	});

	test('throws a helpful error when no source is provided and stdin is a TTY', async () => {
		const stdin = createMockStdin('', true);

		await expect(parseSource(undefined, {}, stdin)).rejects.toThrow(
			'No input source provided. Pass a file path or URL, or pipe HTML to stdin.'
		);
	});

	test('treats --md as an alias for --markdown', async () => {
		const viaAlias = await parseSource('-', { md: true }, createMockStdin(fixtureHtml));
		const viaFlag = await parseSource('-', { markdown: true }, createMockStdin(fixtureHtml));

		expect(viaAlias.output).toEqual(viaFlag.output);
	});

	test('rejects an unknown --property name', async () => {
		await expect(
			parseSource('-', { property: 'doesNotExist' }, createMockStdin(fixtureHtml))
		).rejects.toThrow('Property "doesNotExist" not found in response');
	});
});