Conversation
| return session | ||
|
|
||
| def get_page_content(self, url: str) -> Optional[str]: | ||
| """Get HTML content from a URL""" |
There was a problem hiding this comment.
Let's elaborate on the docstring.
For example: "Get HTML content from a URL with a session timeout of 30 seconds" (or is it minutes?).
The session has a default of 3 retries, etc.
| w3schools_scraper = W3SchoolsScraper(args.url, delay=args.delay) | ||
|
|
||
| # Create the application | ||
| app = ScraperApplication(w3schools_scraper) |
There was a problem hiding this comment.
Instead of ScraperApplication, we will use a ZenML pipeline for orchestration.
| import logging | ||
| from abc import ABC, abstractmethod | ||
| from typing import Any | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class DataExporter(ABC): | ||
| """Abstract base class for data exporters""" | ||
|
|
||
| def __init__(self): | ||
| self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") | ||
|
|
||
| @abstractmethod | ||
| def export(self, data: Any, filename: str) -> None: | ||
| """Export data to a file""" | ||
| pass No newline at end of file |
There was a problem hiding this comment.
Not required for now, as we will be storing the text in MongoDB (NoSQL).
| from typing import List, Dict, Any, Optional | ||
| from pydantic import BaseModel, Field, validator | ||
|
|
||
|
|
||
| class TutorialLink(BaseModel): | ||
| """Model for a tutorial link in the sidebar""" | ||
| title: str | ||
| url: str | ||
|
|
||
|
|
||
| class CodeExample(BaseModel): | ||
| """Model for a code example""" | ||
| language: str = "javascript" # Default language | ||
| code: str | ||
|
|
||
|
|
||
| class TutorialContent(BaseModel): | ||
| """Model for the content of a tutorial page""" | ||
| title: str | ||
| url: str | ||
| content: str | ||
| code_examples: List[CodeExample] = [] | ||
| next_link: Optional[str] = None | ||
|
|
||
| @validator('content') | ||
| def content_not_empty(cls, v): | ||
| if not v or len(v.strip()) == 0: | ||
| raise ValueError('content cannot be empty') | ||
| return v | ||
|
|
||
|
|
||
| class TutorialCourse(BaseModel): | ||
| """Model for the entire tutorial course""" | ||
| title: str | ||
| source_url: str | ||
| modules: List[TutorialLink] | ||
| tutorials: List[TutorialContent] | ||
| metadata: Dict[str, Any] = Field(default_factory=dict) No newline at end of file |
There was a problem hiding this comment.
Great start on content data model
There was a problem hiding this comment.
We should store the URL tree in the DB and version it by date.
The extracted content should carry this version and its URL path within the tree in the content data model, so we can easily fetch related content.
There was a problem hiding this comment.
Please add all the metadata files to .gitignore and remove them from the PR.
No description provided.