Skip to content

Commit 4615a3e

Browse files
authored
Merge pull request #8 from SentienceAPI/markdown
handle markdown read
2 parents 75fc4de + 4e63364 commit 4615a3e

4 files changed

Lines changed: 220 additions & 83 deletions

File tree

.github/workflows/sync-extension.yml

Lines changed: 58 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -63,36 +63,25 @@ jobs:
6363
mkdir -p extension-temp
6464
cd extension-temp
6565
66-
# First, try to download the zip archive if available
67-
ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
66+
# Download individual files from release (reliable method - no zip)
67+
echo "📁 Downloading individual files from release..."
68+
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
6869
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
69-
jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')
70-
71-
if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
72-
echo "📦 Downloading extension-package.zip..."
73-
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
74-
unzip -q extension-package.zip -d .
75-
# Files should now be in extension-temp/extension-package/ or extension-temp/
76-
if [ -d "extension-package" ]; then
77-
mv extension-package/* . 2>/dev/null || true
78-
rmdir extension-package 2>/dev/null || true
79-
fi
80-
else
81-
echo "📁 Downloading individual files from release..."
82-
# Download each file from release
83-
curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
84-
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
85-
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
86-
while read url; do
87-
if [ -n "$url" ] && [ "$url" != "null" ]; then
88-
filename=$(basename "$url")
89-
echo " Downloading $filename..."
90-
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
70+
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
71+
while IFS='|' read -r url name; do
72+
if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
73+
# Preserve directory structure from asset name
74+
# If name contains '/', create directories
75+
dir=$(dirname "$name")
76+
if [ "$dir" != "." ]; then
77+
mkdir -p "$dir"
9178
fi
92-
done
93-
fi
79+
echo " Downloading $name..."
80+
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
81+
fi
82+
done
9483
95-
# Verify files were downloaded
84+
# Verify downloaded files
9685
echo "📋 Downloaded files:"
9786
ls -la
9887
@@ -102,34 +91,62 @@ jobs:
10291
# Create extension directory structure
10392
mkdir -p src/extension/pkg
10493
105-
# Copy extension files (check both root and pkg subdirectory)
106-
cp extension-temp/manifest.json src/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
107-
cp extension-temp/content.js src/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
108-
cp extension-temp/background.js src/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
109-
cp extension-temp/injected_api.js src/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
94+
# Copy extension files (handle both root and extension-package/ subdirectory)
95+
# Check root first, then extension-package/ subdirectory
96+
if [ -f "extension-temp/manifest.json" ]; then
97+
cp extension-temp/manifest.json src/extension/
98+
elif [ -f "extension-temp/extension-package/manifest.json" ]; then
99+
cp extension-temp/extension-package/manifest.json src/extension/
100+
else
101+
echo "⚠️ manifest.json not found"
102+
fi
103+
104+
if [ -f "extension-temp/content.js" ]; then
105+
cp extension-temp/content.js src/extension/
106+
elif [ -f "extension-temp/extension-package/content.js" ]; then
107+
cp extension-temp/extension-package/content.js src/extension/
108+
else
109+
echo "⚠️ content.js not found"
110+
fi
111+
112+
if [ -f "extension-temp/background.js" ]; then
113+
cp extension-temp/background.js src/extension/
114+
elif [ -f "extension-temp/extension-package/background.js" ]; then
115+
cp extension-temp/extension-package/background.js src/extension/
116+
else
117+
echo "⚠️ background.js not found"
118+
fi
119+
120+
if [ -f "extension-temp/injected_api.js" ]; then
121+
cp extension-temp/injected_api.js src/extension/
122+
elif [ -f "extension-temp/extension-package/injected_api.js" ]; then
123+
cp extension-temp/extension-package/injected_api.js src/extension/
124+
else
125+
echo "⚠️ injected_api.js not found"
126+
fi
110127
111-
# Copy WASM files (check both root and pkg subdirectory)
128+
# Copy WASM files (check pkg/ first, then extension-package/pkg/)
112129
if [ -f "extension-temp/pkg/sentience_core.js" ]; then
113130
cp extension-temp/pkg/sentience_core.js src/extension/pkg/
114-
elif [ -f "extension-temp/sentience_core.js" ]; then
115-
cp extension-temp/sentience_core.js src/extension/pkg/
131+
elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then
132+
cp extension-temp/extension-package/pkg/sentience_core.js src/extension/pkg/
116133
else
117134
echo "⚠️ sentience_core.js not found"
118135
fi
119136
120137
if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then
121138
cp extension-temp/pkg/sentience_core_bg.wasm src/extension/pkg/
122-
elif [ -f "extension-temp/sentience_core_bg.wasm" ]; then
123-
cp extension-temp/sentience_core_bg.wasm src/extension/pkg/
139+
elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then
140+
cp extension-temp/extension-package/pkg/sentience_core_bg.wasm src/extension/pkg/
124141
else
125142
echo "⚠️ sentience_core_bg.wasm not found"
126143
fi
127144
128145
# Copy TypeScript definitions
129146
if [ -d "extension-temp/pkg" ]; then
130147
cp extension-temp/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
131-
elif [ -d "extension-temp" ]; then
132-
cp extension-temp/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
148+
elif [ -d "extension-temp/extension-package/pkg" ]; then
149+
cp extension-temp/extension-package/pkg/*.d.ts src/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
133150
fi
134151
135152
# Verify copied files
@@ -156,9 +173,9 @@ jobs:
156173
if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
157174
uses: peter-evans/create-pull-request@v5
158175
with:
159-
# Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
176+
# Use PR_TOKEN (for repos with org restrictions); this step does not fall back to GITHUB_TOKEN
160177
# To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
161-
token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
178+
token: ${{ secrets.PR_TOKEN }}
162179
commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
163180
title: "Sync Extension: ${{ steps.release.outputs.tag }}"
164181
body: |

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,36 @@ npm run build
105105
- `.toHaveText(text)`
106106
- `.toHaveCount(n)`
107107

108+
### Content Reading
109+
- `read(browser, options)` - Read page content
110+
- **Default format: `"raw"`** - Returns HTML suitable for Turndown
111+
- `format: "raw"` - Get cleaned HTML
112+
- `format: "markdown"` - Get high-quality markdown (uses Turndown internally)
113+
- `format: "text"` - Get plain text
114+
115+
**Examples:**
116+
```typescript
117+
import { read } from './src';
118+
119+
// Get raw HTML (default)
120+
const result = await read(browser);
121+
const html = result.content;
122+
123+
// Get high-quality markdown (uses Turndown automatically)
124+
const markdownResult = await read(browser, { format: 'markdown' });
125+
const markdown = markdownResult.content;
126+
```
127+
128+
See `examples/read-markdown.ts` for complete examples.
129+
108130
## Examples
109131

110132
See `examples/` directory:
111133
- `hello.ts` - Extension bridge verification
112134
- `basic-agent.ts` - Basic snapshot
113135
- `query-demo.ts` - Query engine
114136
- `wait-and-click.ts` - Wait and actions
137+
- `read-markdown.ts` - Reading page content and converting to markdown
115138

116139
### Content Reading Example
117140

examples/read-markdown.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/**
2+
* Example: Reading page content and converting to markdown
3+
*
4+
* This example shows how to use the read() function to get page content
5+
* and convert it to high-quality markdown using Turndown.
6+
*/
7+
8+
import { SentienceBrowser, read } from '../src';
9+
import TurndownService from 'turndown';
10+
11+
async function main() {
12+
// Initialize browser
13+
const browser = new SentienceBrowser();
14+
await browser.start();
15+
16+
try {
17+
// Navigate to a page
18+
await browser.getPage().goto('https://example.com');
19+
await browser.getPage().waitForLoadState('networkidle');
20+
21+
// Method 1: Get raw HTML (default) and convert with Turndown
22+
console.log('=== Method 1: Raw HTML + Turndown (Recommended) ===');
23+
const result = await read(browser); // format="raw" is default
24+
const htmlContent = result.content;
25+
26+
// Convert to markdown using Turndown (better quality)
27+
const turndownService = new TurndownService({
28+
headingStyle: 'atx', // Use # for headings
29+
bulletListMarker: '-', // Use - for lists
30+
codeBlockStyle: 'fenced', // Use ``` for code blocks
31+
});
32+
33+
// Add custom rules for better conversion
34+
turndownService.addRule('strikethrough', {
35+
filter: ['del', 's', 'strike'] as any,
36+
replacement: (content: string) => `~~${content}~~`,
37+
});
38+
39+
// Strip unwanted tags
40+
turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
41+
42+
const markdown = turndownService.turndown(htmlContent);
43+
console.log(`Markdown length: ${markdown.length} characters`);
44+
console.log(markdown.substring(0, 500)); // Print first 500 chars
45+
console.log('\n');
46+
47+
// Method 2: Get high-quality markdown directly (uses Turndown internally)
48+
console.log('=== Method 2: Direct markdown (High-quality via Turndown) ===');
49+
const result2 = await read(browser, { format: 'markdown' });
50+
const highQualityMarkdown = result2.content;
51+
console.log(`Markdown length: ${highQualityMarkdown.length} characters`);
52+
console.log(highQualityMarkdown.substring(0, 500)); // Print first 500 chars
53+
console.log('\n');
54+
55+
// Method 3: Get plain text
56+
console.log('=== Method 3: Plain text ===');
57+
const result3 = await read(browser, { format: 'text' });
58+
const textContent = result3.content;
59+
console.log(`Text length: ${textContent.length} characters`);
60+
console.log(textContent.substring(0, 500)); // Print first 500 chars
61+
} finally {
62+
await browser.close();
63+
}
64+
}
65+
66+
main().catch(console.error);
67+

src/read.ts

Lines changed: 72 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,80 +1,110 @@
11
/**
2-
* Read page content - enhanced markdown conversion
2+
* Read page content - supports raw HTML, text, and markdown formats
33
*/
44

55
import { SentienceBrowser } from './browser';
66
import TurndownService from 'turndown';
77

88
export interface ReadOptions {
9-
format?: 'text' | 'markdown';
10-
enhance_markdown?: boolean;
9+
format?: 'raw' | 'text' | 'markdown';
10+
enhanceMarkdown?: boolean;
1111
}
1212

1313
export interface ReadResult {
1414
status: 'success' | 'error';
1515
url: string;
16-
format: 'text' | 'markdown';
16+
format: 'raw' | 'text' | 'markdown';
1717
content: string;
1818
length: number;
1919
error?: string;
2020
}
2121

2222
/**
23-
* Read page content as text or markdown
23+
* Read page content as raw HTML, text, or markdown
2424
*
2525
* @param browser - SentienceBrowser instance
2626
* @param options - Read options
2727
* @returns ReadResult with page content
28+
*
29+
* @example
30+
* // Get raw HTML (default)
31+
* const result = await read(browser);
32+
* const htmlContent = result.content;
33+
*
34+
* @example
35+
* // Get high-quality markdown (uses Turndown internally)
36+
* const result = await read(browser, { format: 'markdown' });
37+
* const markdown = result.content;
38+
*
39+
* @example
40+
* // Get plain text
41+
* const result = await read(browser, { format: 'text' });
42+
* const text = result.content;
2843
*/
2944
export async function read(
3045
browser: SentienceBrowser,
3146
options: ReadOptions = {}
3247
): Promise<ReadResult> {
3348
const page = browser.getPage();
34-
const format = options.format || 'text';
35-
const enhanceMarkdown = options.enhance_markdown !== false; // Default to true
36-
37-
// Get basic content from extension
38-
const result = (await page.evaluate(
39-
(opts) => {
40-
return (window as any).sentience.read(opts);
41-
},
42-
{ format }
43-
)) as ReadResult;
49+
const format = options.format || 'raw'; // Default to 'raw' for Turndown compatibility
50+
const enhanceMarkdown = options.enhanceMarkdown !== false; // Default to true
4451

45-
// Enhance markdown if requested and format is markdown
46-
if (format === 'markdown' && enhanceMarkdown && result.status === 'success') {
47-
try {
48-
// Get full HTML from page
49-
const htmlContent = await page.evaluate(
50-
() => document.documentElement.outerHTML
51-
);
52+
if (format === 'markdown' && enhanceMarkdown) {
53+
// Get raw HTML from the extension first
54+
const rawHtmlResult = (await page.evaluate(
55+
(opts) => {
56+
return (window as any).sentience.read(opts);
57+
},
58+
{ format: 'raw' }
59+
)) as ReadResult;
5260

53-
// Use turndown for better conversion
54-
const turndownService = new TurndownService({
55-
headingStyle: 'atx', // Use # for headings
56-
bulletListMarker: '-', // Use - for lists
57-
codeBlockStyle: 'fenced', // Use ``` for code blocks
58-
});
61+
if (rawHtmlResult.status === 'success') {
62+
const htmlContent = rawHtmlResult.content;
63+
try {
64+
const turndownService = new TurndownService({
65+
headingStyle: 'atx',
66+
hr: '---',
67+
bulletListMarker: '-',
68+
codeBlockStyle: 'fenced',
69+
emDelimiter: '*',
70+
});
5971

60-
// Add custom rules for better conversion
61-
turndownService.addRule('strikethrough', {
62-
filter: ['del', 's', 'strike'] as any,
63-
replacement: (content: string) => `~~${content}~~`,
64-
});
72+
// Add custom rules for better markdown
73+
turndownService.addRule('strikethrough', {
74+
filter: (node) => ['s', 'del', 'strike'].includes(node.nodeName.toLowerCase()),
75+
replacement: function (content) {
76+
return '~~' + content + '~~';
77+
},
78+
});
6579

66-
// Strip unwanted tags
67-
turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);
80+
// Strip non-content tags (script, style, noscript, iframe) entirely
81+
turndownService.remove(['script', 'style', 'noscript', 'iframe'] as any);
6882

69-
const enhancedMarkdown = turndownService.turndown(htmlContent);
70-
result.content = enhancedMarkdown;
71-
result.length = enhancedMarkdown.length;
72-
} catch (e) {
73-
// If enhancement fails, use extension's result
74-
result.error = `Markdown enhancement failed: ${e}`;
83+
const markdownContent = turndownService.turndown(htmlContent);
84+
return {
85+
status: 'success',
86+
url: rawHtmlResult.url,
87+
format: 'markdown',
88+
content: markdownContent,
89+
length: markdownContent.length,
90+
};
91+
} catch (e: any) {
92+
console.warn(`Turndown conversion failed: ${e.message}, falling back to extension's markdown.`);
93+
// Fallback to extension's markdown if Turndown fails
94+
}
95+
} else {
96+
console.warn(`Failed to get raw HTML from extension: ${rawHtmlResult.error}, falling back to extension's markdown.`);
97+
// Fallback to extension's markdown if getting raw HTML fails
7598
}
7699
}
77100

101+
// If not enhanced markdown, or fallback, call extension with requested format
102+
const result = (await page.evaluate(
103+
(opts) => {
104+
return (window as any).sentience.read(opts);
105+
},
106+
{ format }
107+
)) as ReadResult;
108+
78109
return result;
79110
}
80-

0 commit comments

Comments
 (0)