Skip to content

Commit 7d8ec24

Browse files
authored
feat(firecrawl): add parse operation and revert short-input selection style (#4340)
* feat(firecrawl): add parse operation and revert short-input selection style * chore(firecrawl): regenerate docs and integrations data for parse * fix(firecrawl): forward firecrawl error body in parse route response * fix(firecrawl): add pricing config to parse tool hosting
1 parent 8d042f7 commit 7d8ec24

10 files changed

Lines changed: 611 additions & 8 deletions

File tree

apps/docs/content/docs/en/tools/firecrawl.mdx

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,4 +234,48 @@ Autonomous web data extraction agent. Searches and gathers information based on
234234
| `expiresAt` | string | Timestamp when the results expire \(24 hours\) |
235235
| `sources` | object | Array of source URLs used by the agent |
236236

237+
### `firecrawl_parse`
238+
239+
Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls.
240+
241+
#### Input
242+
243+
| Parameter | Type | Required | Description |
244+
| --------- | ---- | -------- | ----------- |
245+
| `file` | file | Yes | Document file to be parsed |
246+
| `formats` | array | No | Output formats to return \(e.g., \["markdown"\]\). Defaults to markdown. |
247+
| `onlyMainContent` | boolean | No | Exclude headers, navs, footers. Defaults to true. |
248+
| `includeTags` | array | No | HTML tags to include |
249+
| `excludeTags` | array | No | HTML tags to exclude |
250+
| `timeout` | number | No | Timeout in milliseconds \(max 300000\). Defaults to 30000. |
251+
| `parsers` | array | No | Parser configuration \(e.g., \[\{ "type": "pdf" \}\]\) |
252+
| `removeBase64Images` | boolean | No | Remove base64 images, keep alt text. Defaults to true. |
253+
| `blockAds` | boolean | No | Block ads and popups. Defaults to true. |
254+
| `proxy` | string | No | Proxy mode: "basic" or "auto" |
255+
| `zeroDataRetention` | boolean | No | Enable zero data retention. Defaults to false. |
256+
| `apiKey` | string | Yes | Firecrawl API key |
257+
| `rateLimit` | string | No | No description |
258+
259+
#### Output
260+
261+
| Parameter | Type | Description |
262+
| --------- | ---- | ----------- |
263+
| `markdown` | string | Parsed document content in markdown format |
264+
| `summary` | string | Generated summary of the document |
265+
| `html` | string | Processed HTML content |
266+
| `rawHtml` | string | Unprocessed raw HTML content |
267+
| `screenshot` | string | Screenshot URL or base64 \(when requested\) |
268+
| `links` | array | URLs discovered in the document |
269+
| `metadata` | object | Document metadata |
270+
| ↳ `title` | string | Document title |
271+
| ↳ `description` | string | Document description |
272+
| ↳ `language` | string | Document language code |
273+
| ↳ `sourceURL` | string | Source URL |
274+
| ↳ `url` | string | Final URL |
275+
| ↳ `keywords` | string | Document keywords |
276+
| ↳ `statusCode` | number | HTTP status code |
277+
| ↳ `contentType` | string | Document content type |
278+
| ↳ `error` | string | Error message if parse failed |
279+
| `warning` | string | Warning message from the parse operation |
280+
237281

apps/docs/content/docs/en/tools/notion.mdx

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,6 @@ Create a new database in Notion with custom properties
256256

257257
### `notion_add_database_row`
258258

259-
Add a new row to a Notion database with specified properties
260-
261259
#### Input
262260

263261
| Parameter | Type | Required | Description |

apps/sim/app/(landing)/integrations/data/integrations.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4020,9 +4020,13 @@
40204020
{
40214021
"name": "Agent",
40224022
"description": "Autonomous web data extraction agent. Searches and gathers information based on natural language prompts without requiring specific URLs."
4023+
},
4024+
{
4025+
"name": "Parse Document",
4026+
"description": "Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls."
40234027
}
40244028
],
4025-
"operationCount": 6,
4029+
"operationCount": 7,
40264030
"triggers": [],
40274031
"triggerCount": 0,
40284032
"authType": "api-key",
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import { createLogger } from '@sim/logger'
2+
import { toError } from '@sim/utils/errors'
3+
import { type NextRequest, NextResponse } from 'next/server'
4+
import { z } from 'zod'
5+
import { checkInternalAuth } from '@/lib/auth/hybrid'
6+
import { generateRequestId } from '@/lib/core/utils/request'
7+
import { withRouteHandler } from '@/lib/core/utils/with-route-handler'
8+
import { RawFileInputSchema } from '@/lib/uploads/utils/file-schemas'
9+
import { processFilesToUserFiles } from '@/lib/uploads/utils/file-utils'
10+
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
11+
12+
export const dynamic = 'force-dynamic'
13+
14+
const logger = createLogger('FirecrawlParseAPI')
15+
16+
/**
 * Request body accepted by the Firecrawl parse route.
 *
 * - `apiKey`: non-empty Firecrawl API key, sent upstream as a bearer token.
 * - `file`: raw file input describing the document to parse.
 * - `options`: optional free-form map forwarded to Firecrawl as JSON.
 */
const FirecrawlParseSchema = z.object({
  apiKey: z.string().min(1, { message: 'API key is required' }),
  file: RawFileInputSchema,
  options: z.optional(z.record(z.unknown())),
})
21+
22+
/**
 * POST handler that proxies a document-parse request to Firecrawl.
 *
 * Flow: authenticate the internal caller, validate the JSON body against
 * `FirecrawlParseSchema`, download the referenced file from storage, and
 * forward it as multipart form data to Firecrawl's `/v2/parse` endpoint.
 *
 * Responses:
 * - 401 when internal auth fails or yields no user id
 * - 400 when the body is not valid JSON, fails schema validation, or no
 *   file could be resolved from the input
 * - the upstream status code when Firecrawl answers with a non-OK status
 * - 500 for any other failure
 * - `{ success: true, output }` when Firecrawl succeeds
 */
export const POST = withRouteHandler(async (request: NextRequest) => {
  const requestId = generateRequestId()

  try {
    const authResult = await checkInternalAuth(request, { requireWorkflowId: false })

    if (!authResult.success || !authResult.userId) {
      logger.warn(`[${requestId}] Unauthorized Firecrawl parse attempt`, {
        error: authResult.error || 'Missing userId',
      })
      return NextResponse.json(
        { success: false, error: authResult.error || 'Unauthorized' },
        { status: 401 }
      )
    }

    // A malformed JSON body is a client error. Without this guard the thrown
    // error would reach the generic handler below and be reported as a 500.
    let body: unknown
    try {
      body = await request.json()
    } catch {
      logger.warn(`[${requestId}] Invalid JSON in Firecrawl parse request body`)
      return NextResponse.json(
        { success: false, error: 'Invalid JSON in request body' },
        { status: 400 }
      )
    }

    // Throws ZodError on invalid input; handled in the catch block as a 400.
    const validatedData = FirecrawlParseSchema.parse(body)

    const [userFile] = processFilesToUserFiles([validatedData.file], requestId, logger)
    if (!userFile) {
      return NextResponse.json({ success: false, error: 'File input is required' }, { status: 400 })
    }

    logger.info(`[${requestId}] Firecrawl parse request`, {
      fileName: userFile.name,
      size: userFile.size,
    })

    // NOTE(review): the file reference originates from the request body —
    // confirm downloadFileFromStorage (or a caller) enforces that the
    // authenticated user is allowed to read it.
    const buffer = await downloadFileFromStorage(userFile, requestId, logger)

    const formData = new FormData()
    const blob = new Blob([new Uint8Array(buffer)], {
      type: userFile.type || 'application/octet-stream',
    })
    formData.append('file', blob, userFile.name)

    // Only attach `options` when the caller actually supplied at least one key.
    if (validatedData.options && Object.keys(validatedData.options).length > 0) {
      formData.append('options', JSON.stringify(validatedData.options))
    }

    // No Content-Type header is set here: fetch derives the multipart
    // boundary from the FormData body.
    const firecrawlResponse = await fetch('https://api.firecrawl.dev/v2/parse', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${validatedData.apiKey}`,
      },
      body: formData,
    })

    if (!firecrawlResponse.ok) {
      // Forward the upstream error body and status so callers see the real cause.
      const errorText = await firecrawlResponse.text()
      logger.error(`[${requestId}] Firecrawl API error:`, errorText)
      return NextResponse.json(
        {
          success: false,
          error: `Firecrawl API error: ${errorText || firecrawlResponse.statusText}`,
        },
        { status: firecrawlResponse.status }
      )
    }

    const firecrawlData = await firecrawlResponse.json()

    logger.info(`[${requestId}] Firecrawl parse successful`)

    return NextResponse.json({
      success: true,
      // Use the `data` field when present, otherwise the whole response body.
      output: firecrawlData.data ?? firecrawlData,
    })
  } catch (error) {
    if (error instanceof z.ZodError) {
      logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
      return NextResponse.json(
        { success: false, error: 'Invalid request data', details: error.errors },
        { status: 400 }
      )
    }

    logger.error(`[${requestId}] Error in Firecrawl parse:`, error)

    return NextResponse.json({ success: false, error: toError(error).message }, { status: 500 })
  }
})

apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/short-input/short-input.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ export const ShortInput = memo(function ShortInput({
347347
<>
348348
<Input
349349
ref={ref as React.RefObject<HTMLInputElement>}
350-
className='allow-scroll w-full overflow-auto text-transparent caret-foreground [-ms-overflow-style:none] [scrollbar-width:none] selection:text-transparent placeholder:text-muted-foreground/50 [&::-webkit-scrollbar]:hidden'
350+
className='allow-scroll w-full overflow-auto text-transparent caret-foreground [-ms-overflow-style:none] [scrollbar-width:none] placeholder:text-muted-foreground/50 [&::-webkit-scrollbar]:hidden'
351351
readOnly={readOnly}
352352
placeholder={placeholder ?? ''}
353353
type='text'

0 commit comments

Comments
 (0)