-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
127 lines (103 loc) · 3.32 KB
/
main.py
File metadata and controls
127 lines (103 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
OpenPull API Service
FastAPI wrapper for OpenPull scraper
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import os
import asyncio
from typing import Optional
import sys
from pathlib import Path
# Add the directory containing this file to sys.path so that `openpull` can be imported
sys.path.insert(0, str(Path(__file__).parent))
from openpull.scraper import FlexibleScraper
app = FastAPI(title="OpenPull API", version="1.0.0")

# CORS middleware
#
# Allowed origins are read from the optional CORS_ALLOW_ORIGINS environment
# variable (comma-separated list). When it is unset or empty, every origin is
# allowed ("*"), matching the previous default.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin must not be combined with credentialed requests:
    # browsers reject "Access-Control-Allow-Origin: *" when credentials are
    # sent, and FastAPI's CORS documentation requires explicit origins when
    # allow_credentials=True. Credentials are therefore only enabled when an
    # explicit origin list has been configured.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Read the Gemini API key from the environment once, at import time.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# A missing or empty key is only warned about here; the module still loads.
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    print("⚠️ WARNING: GEMINI_API_KEY not set. Scraper will not work without it.")

# Shared scraper instance; starts empty and is filled in by get_scraper().
scraper = None
async def get_scraper():
    """Return the shared scraper, building it on the first call.

    Raises:
        HTTPException: status 500 when no scraper exists yet and
            GEMINI_API_KEY is unset or empty.
    """
    global scraper

    # Fast path: an instance was already created by an earlier call.
    if scraper is not None:
        return scraper

    # Without an API key the scraper cannot be constructed.
    if not GEMINI_API_KEY:
        raise HTTPException(status_code=500, detail="GEMINI_API_KEY not configured")

    scraper = FlexibleScraper(api_key=GEMINI_API_KEY)
    return scraper
class ScrapeRequest(BaseModel):
    # Request body for POST /v1/scrape: the target URL plus optional
    # prompt and JSON schema that are forwarded to the scraper.
    # NOTE(review): `schema` is also the name of an attribute on pydantic's
    # BaseModel; confirm the installed pydantic version accepts this field name.
    url: str = Field(..., description="URL to scrape")
    prompt: Optional[str] = Field(
        default=None,
        description="Optional prompt for LLM extraction",
    )
    schema: Optional[dict] = Field(
        default=None,
        alias="schema",
        description="Optional JSON schema for structured output",
    )
class ScrapeResponse(BaseModel):
    # Response envelope for POST /v1/scrape. `success` and `url` are always
    # set; `content`/`data` are filled on success and `error` on failure.
    success: bool
    content: Optional[str] = Field(default=None)
    data: Optional[dict] = Field(default=None)
    error: Optional[str] = Field(default=None)
    url: str
@app.get("/health")
async def health():
    """Health check endpoint"""
    # Returns a static status payload plus a flag telling whether a Gemini
    # API key is available.
    #
    # Truthiness is used instead of `is not None`: an empty GEMINI_API_KEY is
    # treated as "not configured" by get_scraper() and by the startup warning,
    # so the health report must agree with them.
    return {
        "status": "healthy",
        "service": "openpull-api",
        "gemini_configured": bool(GEMINI_API_KEY),
    }
@app.post("/v1/scrape", response_model=ScrapeResponse)
async def scrape(request: ScrapeRequest):
    """
    Scrape a webpage using OpenPull
    Returns scraped content, optionally structured via LLM extraction
    """
    # Parameters: `request` carries the URL and the optional prompt/schema
    # that are passed straight through to the scraper.
    # Returns: a ScrapeResponse; on scraper failure it has success=False and
    # the exception text in `error`.
    # Raises: HTTPException raised while obtaining the scraper (e.g. the 500
    # from get_scraper() when GEMINI_API_KEY is missing) is propagated.
    try:
        scraper_instance = await get_scraper()

        # Scrape with optional prompt and schema
        result = await scraper_instance.scrape(
            url=request.url,
            prompt=request.prompt,
            schema=request.schema,
        )

        # Extract content from result
        # OpenPull returns different formats, handle both
        if isinstance(result, dict):
            content = result.get("content") or result.get("text") or str(result)
            data = result.get("data") or result
        else:
            content = str(result)
            data = None

        return ScrapeResponse(
            success=True,
            content=content,
            data=data,
            url=request.url,
        )
    except HTTPException:
        # Deliberate HTTP errors must keep their status code instead of being
        # flattened into an HTTP 200 response with success=False.
        raise
    except Exception as e:
        # Best-effort boundary: any other failure is logged and reported in
        # the response body rather than crashing the request.
        error_msg = str(e)
        print(f"❌ Scrape error for {request.url}: {error_msg}")
        return ScrapeResponse(
            success=False,
            error=error_msg,
            url=request.url,
        )
@app.get("/")
async def root():
    """Root endpoint with API info"""
    # Paths of the service's functional endpoints, keyed by short name.
    endpoint_paths = {"health": "/health", "scrape": "/v1/scrape"}

    # Static description of the service; key order matches the payload layout.
    return dict(
        service="OpenPull API",
        version="1.0.0",
        endpoints=endpoint_paths,
        docs="/docs",
    )