From 258f69caed274fd4bdb53b875e17534de9359327 Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:48:42 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`Parse`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @R5dan. * https://github.com/Better-MD/better-md/pull/7#issuecomment-2737378662 The following files were modified: * `BetterMD/__init__.py` * `BetterMD/elements/a.py` * `BetterMD/elements/code.py` * `BetterMD/elements/input.py` * `BetterMD/elements/symbol.py` * `BetterMD/elements/table.py` * `BetterMD/elements/text.py` * `BetterMD/elements/title.py` * `BetterMD/html/custom_html.py` * `BetterMD/markdown/custom_markdown.py` * `BetterMD/parse/collection.py` * `BetterMD/parse/html.py` * `BetterMD/parse/markdown.py` * `BetterMD/parse/typing.py` * `BetterMD/rst/custom_rst.py` --- BetterMD/__init__.py | 32 +- BetterMD/elements/a.py | 118 +++++- BetterMD/elements/code.py | 117 ++++- BetterMD/elements/input.py | 38 +- BetterMD/elements/symbol.py | 331 +++++++++++---- BetterMD/elements/table.py | 611 +++++++++++---------------- BetterMD/elements/text.py | 71 +++- BetterMD/elements/title.py | 57 +++ BetterMD/html/custom_html.py | 34 +- BetterMD/markdown/custom_markdown.py | 45 +- BetterMD/parse/collection.py | 65 +++ BetterMD/parse/html.py | 274 ++++++++++++ BetterMD/parse/markdown.py | 528 +++++++++++++++++++++++ BetterMD/parse/typing.py | 28 ++ BetterMD/rst/custom_rst.py | 41 +- 15 files changed, 1854 insertions(+), 536 deletions(-) create mode 100644 BetterMD/elements/title.py create mode 100644 BetterMD/parse/collection.py create mode 100644 BetterMD/parse/html.py create mode 100644 BetterMD/parse/markdown.py create mode 100644 BetterMD/parse/typing.py diff --git a/BetterMD/__init__.py b/BetterMD/__init__.py index 9e04a76..5a1602e 100644 --- a/BetterMD/__init__.py +++ 
b/BetterMD/__init__.py @@ -1,10 +1,26 @@ -import logging -from .elements import A, H1, H2, H3, H4, H5, H6, Head, OL, UL, LI, Text, Div, P, Span, Img, B, I, Br, Blockquote, Hr, Table, Tr, Td, Th, THead, TBody, Input, Code -from .html import CustomHTML -from .markdown import CustomMarkdown -from .rst import CustomRst +from .elements import * +from .parse import Collection, HTMLParser, MDParser, RSTParser +def from_html(html:'str'): + """ + Converts an HTML string into a Symbol object. + + Args: + html: A string containing HTML content to convert. + + Returns: + A Symbol object representing the parsed HTML. + """ + return Symbol.from_html(html) -def enable_debug_mode(): - logging.basicConfig(level=logging.DEBUG) - logger = logging.getLogger("BetterMD") +def from_md(md:'str'): + """ + Converts a Markdown string into a Symbol. + + Args: + md: A Markdown-formatted string. + + Returns: + The Symbol object generated from the Markdown input. + """ + return Symbol.from_md(md) \ No newline at end of file diff --git a/BetterMD/elements/a.py b/BetterMD/elements/a.py index d7ea329..3c2ea73 100644 --- a/BetterMD/elements/a.py +++ b/BetterMD/elements/a.py @@ -1,23 +1,117 @@ -from BetterMD.rst.custom_rst import CustomRst from .symbol import Symbol +from ..rst import CustomRst from ..markdown import CustomMarkdown -from ..html import CustomHTML +import re import typing as t -class MD(CustomMarkdown['A']): - def to_md(self, inner, symbol, parent, **kwargs): - return f"[{" ".join([e.to_md(**kwargs) for e in inner])}]({symbol.get_prop("href")})" +if t.TYPE_CHECKING: + from ..parse import Collection + +class MD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Convert a list of elements into a Markdown hyperlink. + + This function concatenates the Markdown representation of each element in the + `inner` list (using the element’s own `to_md` method) with a space separator, + and wraps the result in Markdown link syntax. 
The URL is obtained from the + `href` property of the given symbol via its `get_prop` method. + + Args: + inner: A list of elements that provide a Markdown representation. + symbol: An object with a "href" property accessed through `get_prop`. + parent: A parent element (unused in this conversion). + + Returns: + A string formatted as a Markdown hyperlink. + """ + return f"[{" ".join([e.to_md() for e in inner])}]({symbol.get_prop("href")})" + + def verify(self, text:'str'): + """ + Checks if the input text contains any Markdown hyperlink patterns. + + This function inspects the provided text for various Markdown link formats: + inline links (e.g., [text](url)), automatic links (e.g., ), and reference links + (e.g., [text][ref] with an associated [ref]: url declaration). It returns True if + any valid link pattern is detected; otherwise, it returns False. + """ + if re.findall("\[([^\]]+)\]\((https?:\/\/[^\s)]+)\)", text): + # Case 1: Inline link + return True + + elif re.findall("<(https?:\/\/[^\s>]+)>", text): + # Case 2: Automatic Links + return True + + elif re.findall("\[([^\]]+)\]\[([^\]]+)\]\s*\n?\[([^\]]+)\]:\s*(https?:\/\/[^\s]+)", text): + # Case 3: Reference Links + return True + + return False -class HTML(CustomHTML['A']): - def to_html(self, inner, symbol, parent, **kwargs): - return f"{" ".join([e.to_html(**kwargs) for e in inner])}" class RST(CustomRst['A']): - def to_rst(self, inner, symbol, parent, **kwargs): - return f"`{' '.join([e.to_rst(**kwargs) for e in inner])} <{symbol.get_prop('href')}>`_" + def to_rst(self, inner, symbol, parent): + """ + Converts inner elements into an RST hyperlink. + + Joins the reStructuredText representations of the inner elements using a space and + formats them as an RST hyperlink with the URL obtained from the symbol's "href" property. + The parent parameter is not used in the conversion. + + Parameters: + inner: A list of objects that implement a to_rst() method, representing the link text. 
+ symbol: An object providing hyperlink properties, where the URL is retrieved via get_prop('href'). + parent: An unused parameter for interface consistency. + + Returns: + A string formatted as an RST hyperlink. + """ + return f"`{' '.join([e.to_rst() for e in inner])} <{symbol.get_prop('href')}>`_" class A(Symbol): prop_list = ["href"] + + refs = {} md = MD() - html = HTML() - rst = RST() \ No newline at end of file + html = "a" + rst = RST() + + @classmethod + def md_refs(cls, references: 'list[str]' = None): + """ + Process Markdown references. + + This placeholder class method accepts an optional list of Markdown reference + strings for future processing. Currently, no operations are performed. + + Args: + references (list[str], optional): A list of Markdown reference strings. + """ + pass + + @classmethod + def rst_refs(cls, references: 'list[str]' = None): + """ + Processes reStructuredText references. + + This placeholder method is intended for future implementation of RST reference + handling. If provided, the list of reference strings may later be validated, + transformed, or registered. The method currently performs no operations. + + Parameters: + references (list[str], optional): A list of RST reference strings. Defaults to None. + """ + pass + + @classmethod + def html_refs(cls, references: 'list[str]' = None): + """ + Processes HTML references from a list of reference strings. + + This class method is a placeholder for HTML reference processing. If provided, the + 'references' parameter should be an optional list of HTML reference strings to be + handled. Currently, the method does not perform any processing. 
+ """ + pass \ No newline at end of file diff --git a/BetterMD/elements/code.py b/BetterMD/elements/code.py index 26ba46c..3235dd4 100644 --- a/BetterMD/elements/code.py +++ b/BetterMD/elements/code.py @@ -2,34 +2,125 @@ from .text import Text from ..markdown import CustomMarkdown from ..html import CustomHTML +from ..rst import CustomRst -class MD(CustomMarkdown['Code']): - def to_md(self, inner, symbol, parent, **kwargs): - language = symbol.get_prop("language", "") +class MD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts content into Markdown code formatting. + + If the input is a Text instance, it is first converted using its own Markdown method. When a language is specified in the symbol or the content is multiline, the method formats the content as a code block with triple backticks and a language identifier. Otherwise, it returns the content wrapped in single backticks for inline code. + + Parameters: + inner: The content to convert, which may be a raw string or a Text object. + symbol: An object from which the programming language is retrieved. + parent: A placeholder parameter for the parent element (currently unused). - content = " ".join([e.to_md(**kwargs) for e in inner]) + Returns: + A string containing the Markdown formatted code. + """ + language = symbol.get_prop("language", "") + if isinstance(inner, Text): + inner = inner.to_md() # If it's a code block (has language or multiline) if language or "\n" in inner: - return f"```{language}\n{content}\n```\n" + return f"```{language}\n{inner}\n```\n" # Inline code - return f"`{content}`" + return f"`{inner}`" class HTML(CustomHTML): - def to_html(self, inner, symbol, parent, **kwargs): - language = symbol.get_prop("language", "") + def to_html(self, inner, symbol, parent): + """ + Generate HTML markup for a code block. 
- content = " ".join([e.to_html(**kwargs) for e in inner]) + Converts a list of elements to their HTML representations by joining the results + of each item’s `to_html()` method with newline characters, and wraps the result in an + HTML tag. If the symbol specifies a programming language via its 'language' + property, a corresponding language-specific class is added to the tag. + + Parameters: + inner: A list of objects with a `to_html()` method. + symbol: An object with properties (including an optional 'language') used for formatting. + parent: A placeholder for potential hierarchical context (unused). + + Returns: + A string containing the HTML markup for the code. + """ + language = symbol.get_prop("language", "") + inner = "\n".join([i.to_html() for i in inner]) if language: - return f'
{content}
' + return f'{inner}' + + return f"{inner}" + + def verify(self, text: str) -> bool: + """ + Checks if the provided text equals "code", ignoring case. + + Args: + text: The string to verify. - return f"{content}" + Returns: + bool: True if text equals "code" (case-insensitive), otherwise False. + """ + return text.lower() == "code" + +class RST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Convert inner content to reStructuredText code format. + + Transforms the provided content into its RST representation based on the language + specified in the symbol and whether the content spans multiple lines. When a language + is indicated or the content contains newlines, the function formats the text as a + code block using the appropriate directive and indentation. Otherwise, it returns the + content as inline code, escaping backticks if present. + + Parameters: + inner: Content to be converted, which may be a list of items (each supporting to_rst()) + or a single item. + symbol: An object used to retrieve properties (e.g., the programming language) for + formatting purposes. + parent: Unused parameter included for interface consistency. + + Returns: + str: The reStructuredText formatted representation of the code. + """ + language = symbol.get_prop("language", "") + + # Handle inner content + if isinstance(inner, list): + content = "".join([ + i.to_rst() if isinstance(i, Symbol) else str(i) + for i in inner + ]) + else: + content = inner.to_rst() if isinstance(inner, Symbol) else str(inner) + + # If it's a code block (has language or multiline) + if language or "\n" in content: + # Use code-block directive for language-specific blocks + if language: + # Indent the content by 3 spaces (RST requirement) + indented_content = "\n".join(f" {line}" for line in content.strip().split("\n")) + return f".. 
code-block:: {language}\n\n{indented_content}\n\n" + + # Use simple literal block for language-less blocks + # Indent the content by 3 spaces (RST requirement) + indented_content = "\n".join(f" {line}" for line in content.strip().split("\n")) + return f"::\n\n{indented_content}\n\n" + + # Inline code + # Escape backticks if they exist in content + if "`" in content: + return f"``{content}``" + return f"`{content}`" class Code(Symbol): - prop_list = ["language"] html = HTML() md = MD() - rst = "``" + rst = RST() nl = True \ No newline at end of file diff --git a/BetterMD/elements/input.py b/BetterMD/elements/input.py index 4cf9e4b..a189401 100644 --- a/BetterMD/elements/input.py +++ b/BetterMD/elements/input.py @@ -3,32 +3,28 @@ from ..markdown import CustomMarkdown from ..rst import CustomRst -class HTML(CustomHTML): - def to_html(self, inner, symbol, parent, **kwargs): - # Collect all input attributes - attrs = [] - for prop in Input.props: - value = symbol.get_prop(prop) - if value: - # Handle boolean attributes like 'required', 'disabled', etc. - if isinstance(value, bool) and value: - attrs.append(prop) - else: - attrs.append(f'{prop}="{value}"') - - attrs_str = " ".join(attrs) - return f"" - class MD(CustomMarkdown): - def to_md(self, inner, symbol, parent, **kwargs): + def to_md(self, inner, symbol, parent): + """ + Converts an input symbol to its Markdown representation. + + If the symbol is of type "checkbox", returns a Markdown list item that displays the + checkbox's status ("x" if checked, a space if not) followed by the inner content’s Markdown. + For other types, returns the symbol’s HTML representation. 
+ """ if symbol.get_prop("type") == "checkbox": - return f"- [{'x' if symbol.get_prop('checked', '') else ''}] {inner.to_md()}" + return f"- [{'x' if symbol.get_prop('checked', '') else ' '}] {inner.to_md()}" return symbol.to_html() class RST(CustomRst): - def to_rst(self, inner, symbol, parent, **kwargs): + def to_rst(self, inner, symbol, parent): + """ + Generate a reStructuredText representation of an input symbol. + + If the symbol's type is "checkbox", returns a formatted checkbox with an "x" when checked or a space when not, optionally followed by inner content rendered in RST. For other input types, returns an empty string. + """ if symbol.get_prop("type") == "checkbox": - return f"[ ] {inner.to_rst() if inner else ''}" + return f"[{'x' if symbol.get_prop('checked', '') else ' '}] {inner.to_rst() if inner else ''}" return "" # Most input types don't have RST equivalents class Input(Symbol): @@ -50,6 +46,6 @@ class Input(Symbol): "multiple", "step" ] - html = HTML() + html = "input" md = MD() rst = RST() \ No newline at end of file diff --git a/BetterMD/elements/symbol.py b/BetterMD/elements/symbol.py index a01e96e..63c753a 100644 --- a/BetterMD/elements/symbol.py +++ b/BetterMD/elements/symbol.py @@ -1,72 +1,76 @@ import typing as t -import logging from ..markdown import CustomMarkdown from ..html import CustomHTML from ..rst import CustomRst - -T = t.TypeVar("T", default=t.Any) -T2 = t.TypeVar("T2", default=t.Any) -logger = logging.getLogger("BetterMD") - -class List(list, t.Generic[T]): - def on_set(self, key, value): ... - - def on_ammend(self, object: 'T'): ... 
- - - def append(self, object: 'T') -> 'None': - self.on_ammend(object) - return super().append(object) - - def get(self, index, default:'T2'=None) -> 't.Union[T, T2]': - try: - return self[index] - except IndexError: - return default - - def __setitem__(self, key, value): - self.on_set(key, value) - return super().__setitem__(key, value) - - def __getitem__(self, item) -> 'T': - return super().__getitem__(item) - - def __iter__(self) -> 't.Iterator[T]': - return super().__iter__() +from ..parse import HTMLParser, MDParser, RSTParser, ELEMENT, TEXT, Collection class Symbol: styles: 'dict[str, str]' = {} classes: 'list[str]' = [] - html: 't.Union[str, CustomHTML, CustomHTML[Symbol]]' = "" - props: 'dict[str, t.Union[str, list[str], dict[str, str]]]' = {} + html: 't.Union[str, CustomHTML]' = "" + props: 'dict[str, str]' = {} prop_list: 'list[str]' = [] vars:'dict[str,str]' = {} - children:'List[Symbol]' = List() - md: 't.Union[str, CustomMarkdown, CustomMarkdown[Symbol], None]' = None - rst: 't.Union[str, CustomRst, CustomRst[Symbol], None]' = None + children:'list[Symbol]' = [] + md: 't.Union[str, CustomMarkdown]' = "" + rst: 't.Union[str, CustomRst]' = "" parent:'Symbol' = None prepared:'bool' = False nl:'bool' = False html_written_props = "" - def __init__(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], dom:'bool'=True, inner:'list[Symbol]'=[], **props): - logger.debug(f"Creating new Symbol with {styles=} {classes=} {dom=} {inner=} {props=}") + collection = Collection() + html_parser = HTMLParser() + md_parser = MDParser() + + def __init_subclass__(cls, **kwargs) -> None: + """ + Automatically registers a new subclass with the symbol collection. + + This method is invoked when a subclass is defined. It adds the subclass to the + class-level collection via the add_symbols method and passes any additional + keyword arguments to the superclass's __init_subclass__. 
+ """ + cls.collection.add_symbols(cls) + super().__init_subclass__(**kwargs) + + def __init__(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], inner:'list[Symbol]'=[], **props): + """ + Initialize a Symbol instance with optional styles, classes, children, and additional properties. + + Args: + styles: A dictionary mapping CSS property names to values. + classes: A list of CSS class names. + inner: A list of child Symbol instances. + **props: Additional properties to assign to the symbol. + """ self.styles = styles self.classes = classes - self.children = List(inner) or List() + self.children = list(inner) or [] self.props = props - self.dom = dom - + + def copy(self, styles:'dict[str,str]'={}, classes:'list[str]'=[], inner:'list[Symbol]'=None): + """ + Create a copy of the symbol with merged styles, specified classes, and inner symbols. + + The provided styles dictionary is updated with the symbol's current styles. If no inner symbols are given, an empty list is used. Returns a new Symbol instance with the combined attributes. + """ if inner == None: - inner = [Symbol()] + inner = [] styles.update(self.styles) return Symbol(styles, classes, inner = inner) - - + + def set_parent(self, parent:'Symbol'): + """ + Sets the parent for this symbol and registers it as a child of that parent. + + Args: + parent: The Symbol instance to assign as this symbol's parent. + """ self.parent = parent self.parent.add_child(self) @@ -81,83 +85,234 @@ def remove_child(self, symbol:'Symbol'): self.children.remove(symbol) def has_child(self, child:'type[Symbol]'): + """ + Check if the symbol has a child of the specified type. + + Iterates over the symbol's children and returns the first instance that is a subclass of the provided type. If no such child exists, returns False. + + Args: + child: The Symbol subclass type to search for among the children. + + Returns: + The first matching Symbol instance if found; otherwise, False. 
+ """ for e in self.children: if isinstance(e, child): return e - + return False - def prepare(self, parent:'t.Union[Symbol, None]'=None, *args, **kwargs): - self.prepared = True - self.parent = parent + def prepare(self, parent:'Symbol'): + """ + Prepare the symbol and its children for processing. + + Marks the symbol as prepared, assigns the given parent as its parent, and + recursively prepares each child symbol by setting their parent to the current symbol. - [symbol.prepare(self, *args, **kwargs) for symbol in self.children] + Args: + parent: The parent symbol to associate with this symbol. + Returns: + The prepared symbol instance. + """ + self.prepared = True + self.parent = parent + for symbol in self.children: + symbol.prepare(self) + return self def replace_child(self, old:'Symbol', new:'Symbol'): + """ + Replace an existing child symbol with a new symbol in the children list. + + This function locates the first occurrence of the specified old symbol among the children, + removes it from the list, and then assigns the new symbol into the position immediately preceding + the old symbol’s original index. Note that if the old symbol is at the start of the list (index 0), + the new symbol will replace the last element due to negative indexing. + + Args: + old: The child symbol to be replaced. + new: The symbol that will take the place of the old child. + + Raises: + ValueError: If the old symbol is not found among the children. + """ i = self.children.index(old) self.children.remove(old) self.children[i-1] = new - - def to_html(self) -> 'str': - if not self.prepared: - self.prepare() + + def to_html(self, indent=1) -> 'str': + """ + Generates the HTML representation of the symbol and its children. + + If the symbol’s HTML attribute is a CustomHTML instance, the method delegates + the conversion to its to_html method. 
Otherwise, it constructs an HTML tag using + the symbol’s tag name, and dynamically includes attributes from its classes, + styles, and properties. Child symbols are recursively converted to HTML with + indentation controlled by the indent parameter; a self-closing tag is returned if + no inner content is generated. + Args: + indent (int): The current indentation level for formatting nested HTML. + + Returns: + str: The HTML string representation of the symbol. + """ if isinstance(self.html, CustomHTML): return self.html.to_html(self.children, self, self.parent) + + inner_HTML = f"\n{" "*indent}".join([e.to_html(indent+1) if not (len(self.children) == 1 and self.children[0].html == "text") else e.to_html(0) for e in self.children]) + return f"<{self.html}{" " if self.styles or self.classes or self.props else ""}{f"class={'"'}{' '.join(self.classes) or ''}{'"'}" if self.classes else ""}{" " if (self.styles or self.classes) and self.props else ""}{f"style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ""}{'"'}" if self.styles else ""}{" " if (self.styles or self.classes) and self.props else ""}{' '.join([f'{k}={'"'}{v}{'"'}' if v != "" else f'{k}' for k,v in self.props.items()])}{f">{"\n" if len(self.children) > 1 else ""}{inner_HTML}{"\n" if len(self.children) > 1 else ""}" if inner_HTML else f" />"}" + + def to_md(self) -> 'str': + """ + Converts the symbol to a Markdown formatted string. 
- props = [] - for prop, value in self.props.items(): - if isinstance(value, list): - props.append(f"{prop}={'"'}{' '.join(value)}{'"'}") - elif isinstance(value, dict): - props.append(f"{prop}={'"'}{' '.join([f'{k}:{v}' for k,v in value.items()])}{'"'}") - else: - props.append(f"{prop}={value}") - - inner_HTML = "\n".join([e.to_html() for e in self.children]) - logger.debug(f"{inner_HTML=} {self.html=} {self.classes=} {self.styles=} {props=}") - return f"<{self.html} class={'"'}{' '.join(self.classes) or ''}{'"'} style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ''}{'"'} {' '.join(props)}>{inner_HTML}" - - def to_md(self, **kwargs) -> 'str': - if not self.prepared: - self.prepare(**kwargs) - + If the symbol's markdown attribute is a CustomMarkdown instance, its custom conversion method + is used with the symbol's children, the symbol itself, and its parent. Otherwise, this method + concatenates the symbol's markdown content with the Markdown representations of its child symbols, + appending a newline if the symbol's 'nl' flag is set. + """ if isinstance(self.md, CustomMarkdown): - return self.md.to_md(self.children, self, self.parent, **kwargs) + return self.md.to_md(self.children, self, self.parent) + + inner_md = "".join([e.to_md() for e in self.children]) + return f"{self.md}{inner_md}" + ("\n" if self.nl else "") + + def to_rst(self) -> 'str': + """ + Converts the symbol and its children to reStructuredText format. - if self.md == None: - return self.to_html(**kwargs) + If the symbol’s rst attribute is a CustomRst instance, its to_rst() method is + called with the symbol’s children, the symbol itself, and its parent. Otherwise, + the method assembles the RST representation by concatenating the symbol’s rst + value, a space-separated string of its children's RST representations, and the + rst value again, followed by a newline. 
- inner_md = " ".join([e.to_md() for e in self.children]) - return f"{self.md} {inner_md}" + ("\n" if self.nl else "") - - def to_rst(self, **kwargs) -> 'str': - if not self.prepared: - self.prepare(**kwargs) - + Returns: + str: The complete reStructuredText representation of the symbol. + """ if isinstance(self.rst, CustomRst): return self.rst.to_rst(self.children, self, self.parent) - - if self.rst == None: - return f".. raw:: html\n\n{" ".join(self.to_html().splitlines())}\n" - + inner_rst = " ".join([e.to_rst() for e in self.children]) return f"{self.rst}{inner_rst}{self.rst}\n" - - def get_prop(self, prop, default="") -> 't.Union[str, list[str], dict[str, str]]': + + @classmethod + def from_html(cls, text:'str') -> 'list[Symbol]': + """ + Parses HTML content and returns a list of Symbol instances. + + This class method uses the class's HTML parser to transform the input HTML text into a list + of element dictionaries. For each element, it retrieves the corresponding symbol from the + collection by name and then parses the element into a Symbol instance. An error is raised + if no matching symbol is found. + + Args: + text: A string containing HTML content. + + Returns: + A list of Symbol instances generated from the parsed HTML elements. + """ + parsed = cls.html_parser.parse(text) + return [cls.collection.find_symbol(elm['name'] , raise_errors=True).parse(elm) for elm in parsed] + + @classmethod + def parse(cls, text:'ELEMENT') -> 'Symbol': + """ + Parses an element representation into a Symbol instance. + + This class method converts a structured element—provided as a dictionary containing keys + like "attributes", "children", "name", and "type"—into a corresponding Symbol. It extracts + inline CSS styles and class names from the element's attributes and recursively processes + its children. For text nodes (where the type is "text"), the method uses the designated + text symbol from the collection. 
+ + Args: + text: A dictionary representing the element to be parsed. + + Returns: + A Symbol instance corresponding to the parsed element and its nested children. + """ + def handle_element(element:'ELEMENT|TEXT') -> 'Symbol': + if element['type'] == 'text': + text = cls.collection.find_symbol("text", raise_errors=True) + assert text is not None, "`collection.find_symbol` is broken" + + return text(element['content']) + + symbol_cls = cls.collection.find_symbol(element['name'], raise_errors=True) + assert symbol_cls is not None, "`collection.find_symbol` is broken" + + return symbol_cls.parse(element) + + styles = {s.split(":")[0]: s.split(":")[1] for s in text["attributes"].pop("style", "").split(";") if ":" in s} + classes = list(filter(lambda c: bool(c), text["attributes"].pop("class", "").split(" "))) + + return cls(styles, classes, inner=[handle_element(elm) for elm in text["children"]], **text["attributes"]) + + @classmethod + def from_md(cls, text: str) -> 'Symbol': + """ + Parses Markdown text into a Symbol instance. + + This class method converts the input Markdown into a structured form using the + Markdown parser, then retrieves and instantiates the corresponding Symbol from + the collection using the parsed symbol name. + + Args: + text: Markdown formatted text. + + Returns: + A Symbol instance representing the parsed Markdown content. + """ + parsed = cls.md_parser.parse(text) + return cls.collection.find_symbol(parsed['name'], raise_errors=True).parse(parsed) + + + + def get_prop(self, prop, default="") -> 'str': + """ + Retrieves a property value from the symbol's properties. + + Looks up the specified key in the properties dictionary and returns its value if found; + otherwise, returns the provided default value. + + Args: + prop: The property key to retrieve. + default: Value to return if the property is not present (defaults to an empty string). + + Returns: + The property value as a string. 
+ """ return self.props.get(prop, default) - def set_prop(self, prop:'str', value:'t.Union[str, list[str], dict[str, str]]'): + def set_prop(self, prop, value): self.props[prop] = value def __contains__(self, item): + """ + Checks if a given item or type exists among the symbol's children. + + If the provided item is callable (typically a type), the method returns True + if any child is an instance of that type; otherwise, it performs a standard + membership check in the children list. + """ if callable(item): return any(isinstance(e, item) for e in self.children) return item in self.children def __str__(self): - return f"<{self.html} class={'"'}{' '.join(self.classes) or ''}{'"'} style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ''}{'"'} {' '.join(self.props)}/>" + """Return an HTML-like string representation of the symbol. + + Constructs a string that mimics an HTML element using the symbol's tag name and, + if present, includes attributes for CSS classes, inline styles, and additional properties. + When the symbol has more than one child, extra formatting with newlines is applied and + the count of child symbols is displayed between the opening and closing tags. 
+ """ + return f"<{self.html}{" " if self.styles or self.classes or self.props else ""}{f"class={'"'}{' '.join(self.classes) or ''}{'"'}" if self.classes else ""}{" " if (self.styles or self.classes) and self.props else ""}{f"style={'"'}{' '.join([f'{k}:{v}' for k,v in self.styles.items()]) or ""}{'"'}" if self.styles else ""}{" " if (self.styles or self.classes) and self.props else ""}{' '.join([f'{k}={'"'}{v}{'"'}' if v != "" else f'{k}' for k,v in self.props.items()])}{f">{"\n" if len(self.children) > 1 else ""}{"\n" if len(self.children) > 1 else ""}{len(self.children)}"}" + + __repr__ = __str__ \ No newline at end of file diff --git a/BetterMD/elements/table.py b/BetterMD/elements/table.py index 6576662..edbf693 100644 --- a/BetterMD/elements/table.py +++ b/BetterMD/elements/table.py @@ -1,393 +1,284 @@ -from .symbol import Symbol, List +from .symbol import Symbol from ..markdown import CustomMarkdown from ..rst import CustomRst from .h import H1, H2, H3, H4, H5, H6 from .text import Text -import logging -import typing as t +import itertools as it -if t.TYPE_CHECKING: - # Wont be imported at runtime - import pandas as pd # If not installed, will not affedt anything at runtime - -logger = logging.getLogger("BetterMD") - -class TrMD(CustomMarkdown['Tr']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - logger.debug("Converting Tr element to Markdown") - contents = "\n".join([e.to_md() for e in inner]) - split_content = contents.splitlines() - logger.debug(f"Split content: {split_content}") - ret = f"| {" | ".join(split_content)} |" - return ret - - -class THeadMD(CustomMarkdown['THead']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - md = [] - for child in symbol.head.children: - e = child.to_md() - - md.append({"len":len(e), "style":child.styles.get("text-align", "justify")}) - - def parse_md(data: 'dict') -> 'str': - start = " :" if data["style"] in ["left", "center"] else " " - middle = "-"*(data["len"]-2) if 
data["style"] == "center" else "-"*(data["len"]-1) if data["style"] in ["left", "right"] else "-"*(data["len"]) - end = ": " if data["style"] in ["right", "center"] else " " - - return f"{start}{middle}{end}" - - return f"{inner[0].to_md()}\n|{"|".join([parse_md(item) for item in md])}|" +class TableMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Convert table sections to a Markdown formatted string. -class TBodyMD(CustomMarkdown['TBody']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - content = [e.to_md() for e in inner if isinstance(e, Tr)] - logger.debug(f"TBody conent: {content}") - return "\n".join(content) - -class TdMD(CustomMarkdown['Td']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - if not pretty: - return " ".join([e.to_md() for e in inner]) - - length = len(max(symbol.table.cols[symbol.header], key=len).data) - logger.debug(f"Td length: {len(symbol)}") - logger.debug(f"Column length: {length}") - return " ".join([e.to_md() for e in inner]).center(length) - -class ThMD(CustomMarkdown['Th']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - if not pretty: - return " ".join([e.to_md() for e in inner]) + Iterates over the provided table sections, converting header sections (THead) and body + sections (TBody) using their respective to_md methods. The header is extracted from the + first encountered THead element, and all non-empty TBody outputs are combined with line + breaks to form the final Markdown table. 
+ """ + result = [] + thead_content = "" + tbody_rows = [] - width = len(max(symbol.table.cols[symbol.header], key=len).data) - + # Process inner elements + for section in inner: + if isinstance(section, THead): + thead_content = section.to_md() + elif isinstance(section, TBody): + tbody_content = section.to_md() + if tbody_content: + tbody_rows.append(tbody_content) - if symbol.data == "": - return "".center(width) + # Combine all parts + if thead_content: + result.append(thead_content) - return f"**{" ".join([e.to_md() for e in inner]).center(width)}**" - -class TableMD(CustomMarkdown['Table']): - def to_md(self, inner, symbol, parent, pretty=True, **kwargs): - logger.debug("Converting Table element to Markdown") - head = symbol.head.to_md() if symbol.head else None - body = symbol.body.to_md() - - logger.debug(f"Table conversion complete. Has header: {head is not None}") - return f"{f"{head}\n" if head else ""}{body}" - - -class TableRST(CustomRst['Table']): - def to_rst(self, inner, symbol, parent, **kwargs): - logger.debug("Converting Table element to RST") - head = symbol.head.to_rst() if symbol.head else None - body = symbol.body.to_rst() - - return f"{f"{head}\n" if head else ""}{body}" - -class THeadRST(CustomRst['THead']): - def to_rst(self, inner, symbol, parent, **kwargs): - logger.debug("Converting THead element to RST") - logger.debug(f"THead has {len(inner)} children: {[e.to_rst() for e in inner]}") - top = [len(max(symbol.table.cols[child.header], key=len).data) for child in symbol.head.children] - content = "\n".join([e.to_rst() for e in inner]) - return f"+-{"-+-".join([t*"-" for t in top])}-+\n{content}\n+={"=+=".join([t*"=" for t in top])}=+" - -class TBodyRST(CustomRst['TBody']): - def to_rst(self, inner, symbol, parent, **kwargs): - bottom = [len(max(symbol.table.cols[child.header], key=len).data) for child in symbol.table.head.head.children] - return f'{f"\n+-{"-+-".join(["-"*b for b in bottom])}-+\n".join([e.to_rst() for e in inner if 
isinstance(e, Tr)])}\n+-{"-+-".join(["-"*b for b in bottom])}-+' - -class TrRST(CustomRst['Tr']): - def to_rst(self, inner, symbol, parent, **kwargs): - return f'| {" |\n| ".join(" | ".join([e.to_rst() for e in inner]).split("\n"))} |' - - -class TdRST(CustomRst['Td']): - def to_rst(self, inner, symbol, parent, **kwargs): - content = " ".join([e.to_rst() for e in inner]) - width = len(max(symbol.table.cols[symbol.header], key=len).data) - return content.center(width) + if tbody_rows: + result.append("\n".join(tbody_rows)) + + return "\n".join(result) + +class TableRST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Convert table sections to a reStructuredText formatted table. + + Iterates over provided table header (THead) and body (TBody) sections to compute + the maximum column widths based on cell content. Constructs a table with a top + border, formatted rows with left-aligned cells, and row separators (using "=" + for header rows and "-" for others). Returns an empty string if no valid sections + or rows are found. 
+ """ + if not inner: + return "" + + # First pass: collect all cell widths from both thead and tbody + col_widths = [] + all_rows = [] + + for section in inner: + if isinstance(section, THead) or isinstance(section, TBody): + for row in section.children: + cells = [cell.to_rst() for cell in row.children] + all_rows.append((cells, isinstance(section, THead))) + + # Update column widths + if not col_widths: + col_widths = [len(cell) for cell in cells] + else: + col_widths = [max(old, len(new)) for old, new in zip(col_widths, cells + [''] * (len(col_widths) - len(cells)))] + + if not all_rows: + return "" + + # Second pass: generate RST with consistent widths + result = [] + + # Top border + top_border = "+" + "+".join(["-" * (width + 2) for width in col_widths]) + "+" + result.append(top_border) + + for i, (cells, is_header) in enumerate(all_rows): + # Create row with proper spacing using consistent column widths + row = "| " + " | ".join(cell.ljust(width) for cell, width in zip(cells, col_widths)) + " |" + result.append(row) + + # Add separator after each row + if is_header: + separator = "+" + "+".join(["=" * (width + 2) for width in col_widths]) + "+" + else: + separator = "+" + "+".join(["-" * (width + 2) for width in col_widths]) + "+" + result.append(separator) + + return "\n".join(result) -class ThRST(CustomRst['Th']): - def to_rst(self, inner, symbol, parent, **kwargs): - content = " ".join([e.to_rst() for e in inner]) - width = len(max(symbol.table.cols[symbol.header], key=len).data) - if content == "": - return "".center(width) - return f"**{content}**".center(width) +class THeadMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Generate Markdown table rows with aligned columns. + + Iterates over each row in the input by converting its cell elements to Markdown, + calculates the maximum width for each column, and constructs the table rows using + pipe delimiters. A separator row of dashes matching each column's width is appended. 
+ Returns an empty string if the input list is empty. + + Args: + inner: A list of row elements, each having a 'children' attribute with cell items. + + Returns: + A string containing the formatted Markdown table. + """ + if not inner: + return "" + + rows = [] + widths = [] + + # First pass: collect all rows and calculate column widths + for row in inner: + row_cells = [cell.to_md() for cell in row.children] + if not widths: + widths = [len(cell) for cell in row_cells] + else: + widths = [max(old, len(new)) for old, new in zip(widths, row_cells)] + rows.append(row_cells) + + if not rows: + return "" + + # Second pass: generate properly formatted markdown + result = [] + for row_cells in rows: + row = "|" + "|".join(row_cells) + "|" + result.append(row) + + # Add separator row + separator = "|" + "|".join(["-" * width for width in widths]) + "|" + result.append(separator) + + return "\n".join(result) +class THeadRST(CustomRst): + def to_rst(self, inner, symbol, parent): + # This is now handled by TableRST + """Return an empty string. + + The conversion of the table header to reStructuredText is handled by the TableRST class. + """ + return "" + +class TBodyMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts a list of row elements to a Markdown string. + + Iterates over each element in the provided list by calling its `to_md()` + method and joins the resulting Markdown rows with newline characters. + Returns an empty string if the list is empty. + """ + if not inner: + return "" + + rows = [] + for row in inner: + rows.append(row.to_md()) + + return "\n".join(rows) +class TrMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts a list of cell objects into a Markdown-formatted table row. + + Each cell is processed using its `to_md` method and the resulting strings are + joined using the pipe character (|) as a delimiter, with a leading and trailing + pipe added to conform to Markdown table row syntax.
+ + Args: + inner: List of cell objects to convert. + + Returns: + A string representing the Markdown-formatted table row. + """ + cells = [cell.to_md() for cell in inner] + return f"|{'|'.join(cells)}|" + +class TrRST(CustomRst): + def to_rst(self, inner, symbol, parent): + # This is now handled by TableRST + """ + Delegates row conversion to TableRST. + + This method returns an empty string because row processing is handled by + the TableRST class. + """ + return "" + +class TdMD(CustomMarkdown): + def to_md(self, inner, symbol, parent): + """ + Converts a list of elements to a Markdown string. + + Iterates over each element in the provided list, calls its to_md method, and + joins the resulting strings with a space. + + Args: + inner: A list of elements supporting a to_md conversion. + symbol: The current symbol context (unused). + parent: The parent symbol context (unused). + + Returns: + A string containing the Markdown representations of the inner elements. + """ + return " ".join([e.to_md() for e in inner]) + +class TdRST(CustomRst): + def to_rst(self, inner: list[Symbol], symbol: Symbol, parent: Symbol) -> str: + """ + Converts inner symbols to their reStructuredText representation. + + Returns an empty string if no inner symbols are provided. If a single element of type Text or a heading (H1–H6) + is present, its reStructuredText conversion is returned. Otherwise, the reStructuredText representations of all + inner symbols are joined together with spaces as a fallback. + """ + if not inner: + return "" + + if len(inner) > 1 or not isinstance(inner[0], (Text, H1, H2, H3, H4, H5, H6)): + return " ".join([e.to_rst() for e in inner]) # Fallback to join instead of raising error + return inner[0].to_rst() + +class ThRST(CustomRst): + def to_rst(self, inner, symbol, parent): + """ + Convert inner elements to their reStructuredText (RST) representations.
+ + Iterates over each element in the provided list, converts it to its RST format, and + joins the results with a space. The 'symbol' and 'parent' parameters are included for + interface consistency. + + Returns: + A string containing the RST representations of the inner elements. + """ + return " ".join([e.to_rst() for e in inner]) + +class TBodyRST(CustomRst): + def to_rst(self, inner, symbol, parent): + # This is now handled by TableRST + """ + Returns an empty string. + + This method acts as a placeholder for table body conversion to reStructuredText, + since the complete formatting is handled by the TableRST class. + """ + return "" class Table(Symbol): html = "table" md = TableMD() rst = TableRST() - head:'THead' = None - body:'TBody' = None - - cols: 'dict[Th, list[Td]]' = {} - headers: 'list[Th]' = [] - - def to_pandas(self): - if not self.prepared: - self.prepare() - - logger.debug("Converting Table to pandas DataFrame") - try: - import pandas as pd - df = pd.DataFrame([e.to_pandas() for e in self.body.children], columns=self.head.to_pandas()) - logger.debug(f"Successfully converted table to DataFrame with shape {df.shape}") - return df - except ImportError: - logger.error("pandas not installed - tables extra required") - raise ImportError("`tables` extra is required to use `to_pandas`") - except Exception as e: - logger.error(f"Error converting table to pandas: {str(e)}") - raise - - @classmethod - def from_pandas(cls, df:'pd.DataFrame'): - logger.debug(f"Creating Table from pandas DataFrame with shape {df.shape}") - try: - import pandas as pd - self = cls() - head = THead.from_pandas(list(df.columns)) - body = TBody.from_pandas(df) - - self.head = head - self.body = body - - self.add_child(head) - self.add_child(body) - - logger.debug("Successfully created Table from DataFrame") - logger.debug(f"Table has {len(self.head.children)} columns and {len(self.body.children)} rows with shape {df.shape}") - logger.debug(f"Table head: {self.head.to_pandas()}") - 
logger.debug(f"Table body: {[e.to_list() for e in self.body.children]}") - return self - except ImportError: - logger.error("pandas not installed - tables extra required") - raise ImportError("`tables` extra is required to use `from_pandas`") - except Exception as e: - logger.error(f"Error creating table from pandas: {str(e)}") - raise - - def prepare(self, parent = None, *args, **kwargs): - return super().prepare(parent, table=self, *args, **kwargs) - -class THead(Symbol): - html = "thead" - rst = THeadRST() - md = THeadMD() - - table:'Table' = None - children:'List[Tr]' = List() - - head:'Tr' = None - - - def to_pandas(self) -> 'list[str]': - return self.to_list() - - def to_list(self) -> 'list[str]': - if not self.prepared: - self.prepare() - - return self.children[0].to_list() - - @classmethod - def from_pandas(cls, data:'list[str]'): - return cls.from_list(data) - - @classmethod - def from_list(cls, data:'list[str]'): - self = cls() - tr = Tr.from_list(data) - self.add_child(tr) - - return self - - def prepare(self, parent = None, table=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.table.head = self - return super().prepare(parent, table=table, head=True, *args, **kwargs) - -class TBody(Symbol): - html = "tbody" - rst = TBodyRST() - md = TBodyMD() - - table:'Table' = None - children:'List[Tr]' = List() - - def to_pandas(self): - if not self.prepared: - self.prepare() - - logger.debug("Converting TBody to pandas format") - data = [e.to_pandas() for e in self.children] - logger.debug(f"Converted {len(data)} rows from TBody") - return data - - @classmethod - def from_pandas(cls, df:'pd.DataFrame'): - logger.debug(f"Creating TBody from DataFrame with {len(df)} rows") - try: - import pandas as pd - self = cls() - - for i, row in df.iterrows(): - tr = Tr.from_pandas(row) - self.children.append(tr) - logger.debug(f"Added row {i} to TBody") - - return self - except ImportError: - logger.error("pandas not installed - tables extra 
required") - raise ImportError("`tables` extra is required to use `from_pandas`") - - def prepare(self, parent = None, table=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.table.body = self - return super().prepare(parent, table=table, head=False, *args, **kwargs) + nl = True class Tr(Symbol): html = "tr" md = TrMD() rst = TrRST() - table:'Table' = None - - children:'List[t.Union[Td, Th]]' = List() - - def __init__(self, styles = {}, classes = [], dom = True, inner = [], **props): - super().__init__(styles, classes, dom, inner, **props) - - self.is_header = False - if isinstance(self.parent, THead): - self.is_header = True - logger.debug("Tr element identified as header row") - - def to_pandas(self): - if not self.prepared: - self.prepare() - - def get(o, f): - return [getattr(v, f) for v in o] - - try: - import pandas as pd - if self.is_header: - raise ValueError("This `Tr` is a header row and cannot be converted to a pandas `Series`") - return pd.Series({h.data: v.data for h, v in zip(self.table.head.head.children, self.children)}, index=self.table.head.to_pandas()) - - except ImportError: - raise ImportError("`tables` extra is required to use `to_pandas`") - - def to_list(self): - if not self.prepared: - self.prepare() - - return [e.data for e in self.children] - - @classmethod - def from_pandas(cls, series:'pd.Series'): - try: - import pandas as pd - self = cls() - self.children.clear() - for v in series: - td = Td(inner=[Text(v)]) - self.children.append(td) - - return self - except ImportError: - raise ImportError("`tables` extra is required to use `from_pandas`") - - @classmethod - def from_list(cls, data:'list[str]'): - self = cls() - for value in data: - td = Td(inner=[Text(value)]) - self.children.append(td) - - return self - - def prepare(self, parent = None, table=None, head=False, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - if head: self.table.head.head = self - return 
super().prepare(parent, table=table, row=self, *args, **kwargs) - class Td(Symbol): html = "td" md = TdMD() rst = TdRST() - children:'List[Text]' = List() - row:'Tr' = None - - @property - def data(self): - return self.children.get(0, Text("")).text - - @property - def width(self): - return len(self.data) - - def prepare(self, parent = None, table=None, row=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.row = row - - self.header = self.table.headers[self.row.children.index(self)] - self.table.cols[self.header].append(self) - return super().prepare(parent, table=table, *args, **kwargs) - - def __len__(self): - return len(self.data) - class Th(Symbol): html = "th" - md = ThMD() + md = TdMD() rst = ThRST() - children:'List[Text]' = List() - row:'Tr' = None - - def __init__(self, styles: dict[str, str] = {}, classes: list[str] = [], dom: bool = True, inner: list[Symbol] = [], **props): - super().__init__(styles, classes, dom, inner, **props) - - @property - def data(self): - contents = self.children.get(0, Text("")).text - logger.debug(f"Th data: {contents}") - if contents == "": - logger.debug("Th data is empty") - return "" - logger.debug("Th data is not empty") - return f"**{contents}**" - - @property - def width(self): - """Width of the data""" - if self.data == "": - return 0 - return len(self.data)-4 +class THead(Symbol): + html = "thead" + md = THeadMD() + rst = THeadRST() - def prepare(self, parent = None, table=None, row=None, *args, **kwargs): - assert isinstance(table, Table) - self.table = table - self.row = row - self.header = self - self.table.headers.append(self) - self.table.cols[self] = [self] - return super().prepare(parent, table=table, *args, **kwargs) - - def __len__(self): - """Width of the element (data + bolding)""" - return len(self.data) \ No newline at end of file +class TBody(Symbol): + html = "tbody" + md = TBodyMD() + rst = TBodyRST() \ No newline at end of file diff --git a/BetterMD/elements/text.py 
b/BetterMD/elements/text.py index f09900c..86769be 100644 --- a/BetterMD/elements/text.py +++ b/BetterMD/elements/text.py @@ -2,33 +2,58 @@ from ..markdown import CustomMarkdown from ..html import CustomHTML -import typing as t - -class Str(t.Protocol): - def __str__(self) -> str: ... - # This is not equivelant to the html span or p tags but instead just raw text class Text(Symbol): - md = "{t}" - html = "{t}" - rst = "{t}" - - def __init__(self, text:'Str', dom = True, **props): - self.text = str(text) - return super().__init__(dom=dom, **props) - - def to_html(self) -> 'str': + md = "text" + html = "text" + rst = "text" + + def __init__(self, text:str, **props): + """ + Initialize a Text instance with the provided content. + + Assigns the given text to the instance and forwards any additional keyword + arguments to the parent Symbol constructor. + + Args: + text: The content to be rendered. + **props: Additional properties for further configuration. + """ + self.text = text + return super().__init__(**props) + + def to_html(self, indent=0, parent=None): + """ + Return the HTML representation of the text with applied indentation. + + This method formats the stored text by prepending a specified number of four-space + indentation levels. The 'parent' parameter is accepted for interface compatibility + but is currently not used. + + Args: + indent: Number of indentation levels to apply (each level equals four spaces). + parent: Reserved for future use. + + Returns: + A string containing the indented text. + """ + return f"{' '*indent}{self.text}" + + def to_md(self): + """ + Return the text content as Markdown. + + Returns: + str: The original text. + """ return self.text - def to_md(self) -> 'str': + def to_rst(self): + """ + Returns the text as reStructuredText. + + This method returns the raw text stored in the instance, intended for use in reStructuredText output. 
+ """ return self.text - - def to_rst(self) -> 'str': - return self.text - - def __str__(self): - return f"{self.text}" - - __repr__ = __str__ \ No newline at end of file diff --git a/BetterMD/elements/title.py b/BetterMD/elements/title.py new file mode 100644 index 0000000..b97b491 --- /dev/null +++ b/BetterMD/elements/title.py @@ -0,0 +1,57 @@ +from typing import Text +from .symbol import Symbol +from ..markdown import CustomMarkdown +from ..rst import CustomRst +from .text import Text + +class MD(CustomMarkdown): + def to_md(self, inner: list[Symbol], symbol: Symbol, parent: Symbol, **kwargs) -> str: + """ + Converts a title element to its Markdown representation. + + This method expects the 'inner' parameter to be a list containing exactly one + Text element. The title text is converted using its to_md() method and formatted + into a Markdown title declaration. A ValueError is raised if 'inner' does not meet + the expected criteria. + + Args: + inner: A list of Symbol objects that must contain a single Text instance. + + Returns: + A string representing the title in Markdown format. + + Raises: + ValueError: If 'inner' does not contain exactly one Text element. + """ + if not isinstance(inner[0], Text) or len(inner) != 1: + raise ValueError("Title element must contain a single Text element") + + return f'title: "{inner[0].to_md()}"' + +class RST(CustomRst): + def to_rst(self, inner: list[Symbol], symbol: Symbol, parent: Symbol, **kwargs) -> str: + """ + Convert a title element to reStructuredText format. + + Validates that the provided list contains exactly one Text element, raising a + ValueError if this condition is not met. + + Args: + inner: A list of Symbol objects representing the title content, which must + consist of a single Text element. + + Returns: + A reStructuredText formatted title string prefixed with ":title: ". 
+ """ + if not isinstance(inner[0], Text) or len(inner) != 1: + raise ValueError("Title element must contain a single Text element") + + return f":title: {inner[0].to_rst()}" + + +class Title(Symbol): + html = "title" + md = MD() + rst = RST() + + diff --git a/BetterMD/html/custom_html.py b/BetterMD/html/custom_html.py index 2ba3b19..b4365b8 100644 --- a/BetterMD/html/custom_html.py +++ b/BetterMD/html/custom_html.py @@ -1,13 +1,41 @@ import typing as t +from abc import ABC, abstractmethod if t.TYPE_CHECKING: from ..elements.symbol import Symbol T = t.TypeVar("T", default='Symbol') -class CustomHTML(t.Generic[T]): - def to_html(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol') -> str: ... +class CustomHTML(t.Generic[T], ABC): + @abstractmethod + def to_html(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol') -> str: """ +Generate an HTML string for a given symbol structure. - def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol', *args, **kwargs) -> 'list[Symbol]': ... +This abstract method must be implemented by subclasses to convert a symbol, +its inner content, and its parent context into an HTML representation. + +Args: + inner: A list of symbols representing the inner content. + symbol: The symbol instance to be rendered. + parent: The parent symbol providing contextual information. + +Returns: + The HTML string representation of the given symbol. +""" +... + + def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'):""" +Prepares symbols for HTML generation. + +This method serves as a hook for pre-processing the provided symbol and its inner +content before HTML conversion. Subclasses can override this method to implement +custom preparatory behavior. By default, it performs no operations. + +Args: + inner: A list of Symbol objects representing nested elements. + symbol: A symbol instance to be prepared for HTML processing. + parent: The parent Symbol providing contextual reference. +""" +... 
def verify(self, text) -> bool: ... \ No newline at end of file diff --git a/BetterMD/markdown/custom_markdown.py b/BetterMD/markdown/custom_markdown.py index db4e535..1b9ca13 100644 --- a/BetterMD/markdown/custom_markdown.py +++ b/BetterMD/markdown/custom_markdown.py @@ -1,16 +1,53 @@ import typing as t +from abc import ABC, abstractmethod if t.TYPE_CHECKING: from ..elements.symbol import Symbol T = t.TypeVar("T", default='Symbol') -class CustomMarkdown(t.Generic[T]): +class CustomMarkdown(t.Generic[T], ABC): prop = "" md: 'dict[str, str]' = {} - def to_md(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol', **kwargs) -> 'str': ... + @abstractmethod + def to_md(self, inner: 'list[Symbol]', symbol:'T', parent:'Symbol') -> str: """ +Convert a symbol structure to its markdown representation. - def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol', *args, **kwargs) -> 'list[Symbol]': ... +Subclasses must implement this method to produce a markdown formatted string using the +provided inner symbols, target symbol, and contextual parent symbol. - def verify(self, text) -> 'bool': ... \ No newline at end of file +Args: + inner: A list of Symbol instances representing nested content. + symbol: The symbol to be converted. + parent: The parent Symbol providing contextual hierarchy. + +Returns: + A markdown string corresponding to the symbol conversion. +""" +... + + def prepare(self, inner:'list[Symbol]', symbol:'T', parent:'Symbol'): """ +Performs preparation before markdown conversion. + +This placeholder method can be overridden to preprocess or modify the provided +symbol data and context before the actual markdown generation. By default, it +performs no action. + +Args: + inner (list[Symbol]): List of symbol objects representing nested content. + symbol (T): The target symbol for markdown conversion. + parent (Symbol): The parent symbol providing contextual information. +""" +... 
+ + def verify(self, text) -> bool: """ +Verify the provided markdown text. + +Args: + text: The markdown text to validate. + +Returns: + bool: True if the text conforms to the expected markdown format, otherwise False. +""" +... diff --git a/BetterMD/parse/collection.py b/BetterMD/parse/collection.py new file mode 100644 index 0000000..a24372a --- /dev/null +++ b/BetterMD/parse/collection.py @@ -0,0 +1,65 @@ +import typing as t +import logging +from ..html import CustomHTML + +if t.TYPE_CHECKING: + from ..elements import Symbol + +class Collection: + def __init__(self, *symbols:'type[Symbol]'): + """ + Initializes a Collection instance with optional Symbol objects. + + Stores the provided symbols in a list and sets up a logger for diagnostic purposes. + + Args: + *symbols: One or more Symbol instances to include in the collection. + """ + self.symbols = list(symbols) + self.logger = logging.getLogger("BetterMD") + + def add_symbols(self, symbol:'type[Symbol]'): + """ + Appends a symbol to the collection. + + Args: + symbol: A Symbol instance to add to the collection. + """ + self.symbols.append(symbol) + + def remove_symbol(self, symbol:'type[Symbol]'): + """ + Remove a symbol from the collection. + + Removes the specified symbol from the internal list of symbols. If the symbol is not present, a ValueError is raised. + """ + self.symbols.remove(symbol) + + def find_symbol(self, name:'str', raise_errors:'bool'=False) -> 't.Union[None, type[Symbol]]': + """ + Searches for a symbol with a matching HTML representation. + + Iterates over the collection's symbols and returns the first symbol whose HTML is + either a string that exactly matches the given name or, if a CustomHTML instance, + successfully verifies the name. If no match is found and raise_errors is True, a + ValueError is raised; otherwise, None is returned. + + Args: + name: The symbol name to match against the HTML representation. + raise_errors: If True, raises an error when no matching symbol is found. 
+ + Returns: + The matching symbol if found, or None if not found and raise_errors is False. + + Raises: + ValueError: If no matching symbol is found and raise_errors is set to True. + """ + for symbol in self.symbols: + if isinstance(symbol.html, str) and symbol.html == name: + return symbol + elif isinstance(symbol.html, CustomHTML) and symbol.html.verify(name): + return symbol + + if raise_errors: + raise ValueError(f"Symbol `{name}` not found in collection, if using default symbols it may not be supported.") + return None \ No newline at end of file diff --git a/BetterMD/parse/html.py b/BetterMD/parse/html.py new file mode 100644 index 0000000..1f3e301 --- /dev/null +++ b/BetterMD/parse/html.py @@ -0,0 +1,274 @@ +from .typing import ELEMENT +import typing as t + +class HTMLParser: + def __init__(self): + """ + Initialize an HTMLParser instance. + + Resets the parser's internal state to prepare for parsing operations. + """ + self.reset() + + def reset(self): + """ + Resets the parser state to its initial configuration. + + Clears the DOM, current tag, text buffer, attribute name, and tag stack, and resets the state to process text. + """ + self.current_tag:'t.Optional[ELEMENT]' = None + self.dom = [] + self.state = 'TEXT' + self.buffer = '' + self.attr_name = '' + self.tag_stack = [] + + def parse(self, html:'str') -> 'list[ELEMENT]': + """ + Parse an HTML string and construct a DOM representation. + + Resets the parser's internal state and processes the input HTML one character at a time, + transitioning between states to support text nodes, opening tags, self-closing tags, + closing tags, and attributes. The method builds a nested DOM structure from the HTML + and returns it as a list of elements. + + Args: + html: The HTML string to parse. + + Returns: + list[ELEMENT]: A list representing the parsed DOM elements. 
+ """ + self.reset() + + i = 0 + while i < len(html): + char = html[i] + + if self.state == 'TEXT': + if char == '<': + if self.buffer.strip(): + self.handle_text(self.buffer) + self.buffer = '' + self.state = 'TAG_START' + else: + self.buffer += char + + elif self.state == 'TAG_START': + if char == '/': + self.state = 'CLOSING_TAG' + elif char == '!': + self.state = 'COMMENT_OR_DOCTYPE' + self.buffer = '!' + else: + self.state = 'TAG_NAME' + self.buffer = char + + elif self.state == 'TAG_NAME': + if char.isspace(): + self.current_tag = {"type": "element", 'name': self.buffer, 'attributes': {}, 'children': []} + self.buffer = '' + self.state = 'BEFORE_ATTRIBUTE_NAME' + elif char == '>': + self.current_tag = {"type": "element", 'name': self.buffer, 'attributes': {}, 'children': []} + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.current_tag = {"type": "element", 'name': self.buffer, 'attributes': {}, 'children': []} + self.state = 'SELF_CLOSING_TAG' + else: + self.buffer += char + + elif self.state == 'BEFORE_ATTRIBUTE_NAME': + if char.isspace(): + pass + elif char == '>': + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.state = 'SELF_CLOSING_TAG' + else: + self.attr_name = char + self.state = 'ATTRIBUTE_NAME' + + elif self.state == 'ATTRIBUTE_NAME': + if char.isspace(): + self.current_tag['attributes'][self.attr_name] = '' + self.state = 'AFTER_ATTRIBUTE_NAME' + elif char == '=': + self.state = 'BEFORE_ATTRIBUTE_VALUE' + elif char == '>': + self.current_tag['attributes'][self.attr_name] = '' + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.current_tag['attributes'][self.attr_name] = '' + self.state = 'SELF_CLOSING_TAG' + else: + self.attr_name += char + + elif self.state == 'AFTER_ATTRIBUTE_NAME': + if char.isspace(): + pass + elif char == '=': + self.state = 'BEFORE_ATTRIBUTE_VALUE' + 
elif char == '>': + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.state = 'SELF_CLOSING_TAG' + else: + self.current_tag['attributes'][self.attr_name] = '' + self.attr_name = char + self.state = 'ATTRIBUTE_NAME' + + elif self.state == 'BEFORE_ATTRIBUTE_VALUE': + if char.isspace(): + pass + elif char == '"': + self.buffer = '' + self.state = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED' + elif char == "'": + self.buffer = '' + self.state = 'ATTRIBUTE_VALUE_SINGLE_QUOTED' + elif char == '>': + self.current_tag['attributes'][self.attr_name] = '' + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + else: + self.buffer = char + self.state = 'ATTRIBUTE_VALUE_UNQUOTED' + + elif self.state == 'ATTRIBUTE_VALUE_DOUBLE_QUOTED': + if char == '"': + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'AFTER_ATTRIBUTE_VALUE_QUOTED' + else: + self.buffer += char + + elif self.state == 'ATTRIBUTE_VALUE_SINGLE_QUOTED': + if char == "'": + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'AFTER_ATTRIBUTE_VALUE_QUOTED' + else: + self.buffer += char + + elif self.state == 'ATTRIBUTE_VALUE_UNQUOTED': + if char.isspace(): + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'BEFORE_ATTRIBUTE_NAME' + elif char == '>': + self.current_tag['attributes'][self.attr_name] = self.buffer + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 'TEXT' + elif char == '/': + self.current_tag['attributes'][self.attr_name] = self.buffer + self.buffer = '' + self.state = 'SELF_CLOSING_TAG' + else: + self.buffer += char + + elif self.state == 'AFTER_ATTRIBUTE_VALUE_QUOTED': + if char.isspace(): + self.state = 'BEFORE_ATTRIBUTE_NAME' + elif char == '/': + self.state = 'SELF_CLOSING_TAG' + elif char == '>': + self.handle_tag_open(self.current_tag) + self.buffer = '' + self.state = 
# Tag-handling methods of the HTML parser class (the class header lies outside
# this chunk). They read and mutate ``self.tag_stack`` (the chain of currently
# open element dicts) and ``self.dom`` (the list of top-level nodes).

def handle_tag_open(self, tag):
    """
    Attach an opening tag to the DOM and make it the current open element.

    Args:
        tag: Element dict with at least ``'name'`` and ``'children'`` keys.
    """
    # The parent is the innermost open element, or the document root.
    siblings = self.tag_stack[-1]['children'] if self.tag_stack else self.dom
    siblings.append(tag)
    self.tag_stack.append(tag)

def handle_tag_self_closing(self, tag):
    """
    Attach a self-closing tag to the DOM without opening it.

    Args:
        tag: Element dict for the self-closing tag.
    """
    siblings = self.tag_stack[-1]['children'] if self.tag_stack else self.dom
    siblings.append(tag)

def handle_tag_close(self, tag_name):
    """
    Close the nearest open element named ``tag_name``.

    The stack is searched from the innermost element outwards. Every element
    opened after the match is closed with it, so ``<div><p>text</div>`` does
    not leave ``<p>`` open for the rest of the document. Surrounding
    whitespace in ``tag_name`` (as in ``</div >``) is ignored. A closing tag
    with no matching open element is a no-op.

    Args:
        tag_name: Name collected between ``</`` and ``>``.
    """
    wanted = tag_name.strip()
    for depth in range(len(self.tag_stack) - 1, -1, -1):
        if self.tag_stack[depth]['name'] == wanted:
            # Drop the match and everything nested inside it.
            del self.tag_stack[depth:]
            return
class MDParser:
    """
    Line-oriented Markdown parser that builds a dict-based DOM.

    ``parse`` walks the input one line at a time, picks a ``handle_*`` method
    by testing the line against ``top_level_tags`` and collects the resulting
    element dicts in ``self.dom``. Plain text lines are buffered and turned
    into ``p`` elements whenever a block ends.
    """

    # Patterns recognising the first line of each block-level construct.
    # Every pattern is applied to ONE line (never to the joined document).
    top_level_tags = {
        "blockquote": r"^> ?(.*)$",  # group 1: quoted content (may be empty)
        "br": r"\n\n",  # kept for compatibility; can never match a single line
        "code": r"^```([\w+#.-]*)\s*$",  # opening fence, group 1: language

        "h": r"^(#{1,6})(?: (.*))?$",

        "hr": r"^---+$",

        "ul": r"^([ \t]*)(?:-|\+|\*)(?: (.*))?$",  # groups: indent, content
        "ol": r"^([ \t]*)(\d+)\.(?: (.*))?$",  # groups: indent, number, content

        "tr": r"^\|(?:[^|\n]+\|)+$",  # starts and ends with |, non-empty cells
        "thead": r"^\|(?::?-+:?\|)+$",  # header/body separator row

        "title": r"^title: (.+)$",  # group 1: document title
    }

    def __init__(self):
        """Create a parser in a clean state (see ``reset``)."""
        self.reset()

    def reset(self):
        """Clear the DOM, the paragraph buffer, the stacks and the head element."""
        self.dom = []
        self.buffer = ''
        self.list_stack = []
        self.dom_stack = []
        # Set by handle_title; parse() falls back to an empty <head> when None.
        self.head = None

    def create_element(self, name: 'str', attrs: 'dict[str, str]' = None, children: 'list[ELEMENT|TEXT]' = None) -> 'ELEMENT':
        """
        Build an element node.

        Args:
            name: Tag name of the element.
            attrs: Attribute mapping; a fresh empty dict when omitted.
            children: Child nodes; a fresh empty list when omitted.

        Returns:
            A dict with the keys ``type``, ``name``, ``attributes`` and ``children``.
        """
        return {
            "type": "element",
            "name": name,
            "attributes": {} if attrs is None else attrs,
            "children": [] if children is None else children,
        }

    def create_text(self, content: 'str') -> 'TEXT':
        """
        Build a text node.

        Args:
            content: Text carried by the node.

        Returns:
            A dict with the keys ``type``, ``content`` and ``name``.
        """
        return {
            "type": "text",
            "content": content,
            "name": "text"
        }

    def end_block(self):
        """Flush the buffered text into a ``p`` element (if non-blank) and clear the buffer."""
        if self.buffer:
            text = self.buffer.strip()
            if text:
                self.dom.append(self.create_element("p", children=[self.create_text(text)]))
        self.buffer = ''

    def start_block(self):
        """Placeholder hook; performs no action."""
        pass

    def _is_block_start(self, line: 'str') -> bool:
        """Return True when the stripped line matches any pattern in ``top_level_tags``."""
        stripped = line.strip()
        return any(re.match(pattern, stripped) for pattern in self.top_level_tags.values())

    @staticmethod
    def _next_non_blank(text: 'list[str]', pos: int) -> int:
        """Return the index of the first non-blank line after ``pos`` (``len(text)`` if none)."""
        nxt = pos + 1
        while nxt < len(text) and not text[nxt].strip():
            nxt += 1
        return nxt

    def _match_list_item(self, line: 'str'):
        """Return ``(tag, indent, content)`` for a list-item line, or None for any other line."""
        match = re.match(self.top_level_tags["ul"], line)
        if match:
            return "ul", len(match.group(1)), (match.group(2) or "").strip()
        match = re.match(self.top_level_tags["ol"], line)
        if match:
            # group 2 is the item number; the text is group 3.
            return "ol", len(match.group(1)), (match.group(3) or "").strip()
        return None

    def handle_blockquote(self, text: 'list[str]', i):
        """
        Parse the blockquote starting at ``text[i]`` and append it to the DOM.

        The ``>`` marker is removed from every quoted line and the remaining
        lines are parsed by a nested ``MDParser``; the nested parser's nodes
        become the children of the ``blockquote`` element. An unmarked line
        directly after quoted text continues the quote unless it starts
        another block. A blank line ends the quote unless the next non-blank
        line is quoted again, in which case it is a paragraph break inside
        the quote.

        Args:
            text: All lines of the document.
            i: Index of the first quoted line.

        Returns:
            Number of lines consumed minus one (``parse`` adds the one back).
            0 when ``text[i]`` is not a quoted line; nothing is appended then.
        """
        marker = self.top_level_tags["blockquote"]
        inner = []
        pos = i

        while pos < len(text):
            line = text[pos].strip()
            match = re.match(marker, line)
            if match:
                inner.append(match.group(1).rstrip())
            elif not line:
                following = self._next_non_blank(text, pos)
                if following >= len(text) or not re.match(marker, text[following].strip()):
                    break
                inner.append("")
                pos = following
                continue
            elif inner and inner[-1].strip() and not self._is_block_start(line):
                # Lazy continuation of the quoted text above.
                inner.append(line)
            else:
                break
            pos += 1

        consumed = pos - i
        if consumed == 0:
            return 0

        nested = MDParser()
        nested.parse("\n".join(inner))
        self.dom.append(self.create_element("blockquote", children=nested.dom))
        return consumed - 1

    def handle_code(self, text: 'list[str]'):
        """
        Parse a fenced code block and append ``pre > code`` to the DOM.

        ``text[0]`` must be the opening fence. The block runs to the first
        following line that consists only of three backticks, or to the end
        of ``text`` when the fence is never closed. The language tag is stored
        in the ``language`` attribute of the ``code`` element.

        Args:
            text: Lines starting at the opening fence.

        Returns:
            Index, relative to ``text``, of the last line belonging to the
            block (the closing fence when present).

        Raises:
            AssertionError: If ``text`` does not start with an opening fence.
        """
        self.end_block()
        opening = re.match(self.top_level_tags["code"], text[0].strip()) if text else None
        if opening is None:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError("Code block not found")

        last = len(text) - 1
        body = text[1:]
        for index in range(1, len(text)):
            if text[index].strip() == "```":
                last = index
                body = text[1:index]
                break

        code = self.create_element(
            "code", {"language": opening.group(1)}, [self.create_text("\n".join(body))]
        )
        self.dom.append(self.create_element("pre", children=[code]))
        return last

    def handle_br(self, text: 'list[str]'):
        """
        End the current block and add a ``br`` when ``text`` starts with two empty strings.

        Args:
            text: Remaining lines of the document.

        Returns:
            1 when a ``br`` element was appended, otherwise 0.
        """
        self.end_block()
        if len(text) >= 2 and text[0] == "" and text[1] == "":
            self.dom.append(self.create_element("br", {}))
            return 1
        return 0

    def handle_h(self, line: 'str'):
        """
        Append an ``h1``..``h6`` element for a header line.

        Args:
            line: Header line such as ``"## Title"``.

        Raises:
            AssertionError: If ``line`` is not a header line.
        """
        self.end_block()
        match = re.match(self.top_level_tags["h"], line)
        if match is None:
            raise AssertionError("Header not found")

        level = len(match.group(1))
        # A bare "#" has no content group; use an empty text node.
        content = match.group(2) or ""
        self.dom.append(self.create_element(f"h{level}", children=[self.create_text(content)]))

    def handle_hr(self, line: 'str'):
        """End the current block and append an ``hr`` element."""
        self.end_block()
        self.dom.append(self.create_element("hr", {}))

    def handle_text(self, line: 'str'):
        """
        Buffer a text line for the current paragraph.

        A blank line ends the current paragraph instead of being buffered.
        """
        if not line.strip():
            self.end_block()
            return

        self.buffer = f"{self.buffer}\n{line}" if self.buffer else line

    def _append_list_item(self, list_elm, fragments, nested):
        """Append an ``li`` built from text fragments and nested lists; empty items are skipped."""
        children = []
        content = " ".join(fragments).strip()
        if content:
            children.append(self.create_text(content))
        children.extend(nested)
        if children:
            list_elm["children"].append(self.create_element("li", children=children))

    def _build_list(self, text: 'list[str]', start: int, indent_level: int):
        """
        Build the list element that starts at ``text[start]``.

        Returns:
            ``(element, lines_consumed)``, or ``(None, 0)`` when ``text[start]``
            is not a list item or is indented less than ``indent_level``.
        """
        first = self._match_list_item(text[start]) if 0 <= start < len(text) else None
        if first is None or first[1] < indent_level:
            return None, 0

        tag = first[0]
        # The first item defines the indentation of this list level.
        indent_level = first[1]
        list_elm = self.create_element(tag)
        fragments, nested, item_open = [], [], False
        pos = start

        while pos < len(text):
            line = text[pos]

            if not line.strip():
                # A blank line only continues the list when another item of
                # this list (or a nested one) follows it.
                following = self._next_non_blank(text, pos)
                upcoming = self._match_list_item(text[following]) if following < len(text) else None
                if upcoming is None:
                    break
                kind, indent, _ = upcoming
                if indent < indent_level or (indent == indent_level and kind != tag):
                    break
                pos = following
                continue

            item = self._match_list_item(line)
            if item is not None:
                kind, indent, content = item
                if indent < indent_level:
                    break
                if indent > indent_level:
                    # Deeper item: a nested list owned by the current item.
                    sub_list, used = self._build_list(text, pos, indent)
                    nested.append(sub_list)
                    pos += used
                    continue
                if kind != tag:
                    # Same level but other list type: a new, separate list.
                    break
                if item_open:
                    self._append_list_item(list_elm, fragments, nested)
                fragments, nested, item_open = [content], [], True
            elif item_open and not self._is_block_start(line):
                # Continuation line of the current item.
                fragments.append(line.strip())
            else:
                break

            pos += 1

        if item_open:
            self._append_list_item(list_elm, fragments, nested)

        return list_elm, pos - start

    def handle_list(self, text: 'list[str]', i: int, indent_level: int = 0) -> int:
        """
        Parse the list starting at ``text[i]`` and append it to the DOM.

        Items indented deeper than the first item become nested lists inside
        the preceding ``li``. Unmarked lines that start no other block
        continue the current item. A blank line ends the list unless another
        item follows it.

        Args:
            text: All lines of the document.
            i: Index of the first list item.
            indent_level: Minimum indentation accepted for the first item.

        Returns:
            Number of lines consumed; 0 (nothing appended) when ``text[i]`` is
            not an acceptable list item.
        """
        list_elm, consumed = self._build_list(text, i, indent_level)
        if list_elm is None:
            return 0

        self.dom.append(list_elm)
        return consumed

    def handle_table(self, text: 'list[str]', i: int) -> int:
        """
        Parse the table starting at ``text[i]`` and append it to the DOM.

        A table needs a row followed by a separator row. Rows before the
        separator go to ``thead`` as ``th`` cells, rows after it to ``tbody``
        as ``td`` cells. When ``text[i]`` is not followed by a separator the
        line is buffered as ordinary paragraph text.

        Args:
            text: All lines of the document.
            i: Index of the candidate header row.

        Returns:
            Number of lines consumed.
        """
        separator = self.top_level_tags["thead"]
        row_pattern = self.top_level_tags["tr"]

        if i + 1 >= len(text) or not re.match(separator, text[i + 1].strip()):
            # Not a table, treat as regular text
            self.handle_text(text[i].strip())
            return 1

        # Only a confirmed table interrupts the paragraph in progress.
        self.end_block()
        table = self.create_element("table")
        thead = self.create_element("thead")
        tbody = self.create_element("tbody")
        section = thead
        lines_processed = 0

        while i + lines_processed < len(text):
            # Lines are stripped exactly as parse() strips them for detection.
            line = text[i + lines_processed].strip()
            if not line:
                break

            # Separator rows also match the row pattern, so test them first.
            if re.match(separator, line):
                section = tbody
                lines_processed += 1
                continue

            if not re.match(row_pattern, line):
                break

            cell_type = "th" if section is thead else "td"
            cells = [cell.strip() for cell in line.strip('|').split('|')]
            row = self.create_element("tr", children=[
                self.create_element(cell_type, children=[self.create_text(cell)])
                for cell in cells
            ])
            section["children"].append(row)
            lines_processed += 1

        if lines_processed == 0:
            return 0

        if thead["children"]:
            table["children"].append(thead)
        if tbody["children"]:
            table["children"].append(tbody)

        self.dom.append(table)
        return lines_processed

    def handle_title(self, line: 'str'):
        """
        Store a ``head > title`` element built from a ``title: ...`` line.

        Args:
            line: Line of the form ``"title: <text>"``.

        Raises:
            AssertionError: If ``line`` is not a title line.
        """
        self.end_block()
        match = re.match(self.top_level_tags["title"], line)
        if match is None:
            raise AssertionError("Title not found")

        title = self.create_element("title", children=[self.create_text(match.group(1))])
        self.head = self.create_element("head", children=[title])

    def parse(self, markdown: 'str') -> 'ELEMENT':
        """
        Parse Markdown text into an ``html`` element.

        Args:
            markdown: Markdown source.

        Returns:
            An ``html`` element whose children are ``head`` (holding the title
            when a ``title:`` line was present) and ``body`` (the parsed blocks).
        """
        self.reset()
        lines = markdown.splitlines()
        tags = self.top_level_tags
        i = 0

        while i < len(lines):
            line = lines[i].strip()

            # Empty line ends current block
            if not line:
                self.end_block()
                i += 1
            elif re.match(tags["h"], line):
                self.handle_h(line)
                i += 1
            elif re.match(tags["blockquote"], line):
                self.end_block()
                i += self.handle_blockquote(lines, i) + 1
            elif re.match(tags["code"], line):
                i += self.handle_code(lines[i:]) + 1
            elif re.match(tags["hr"], line):
                self.handle_hr(line)
                i += 1
            elif self._match_list_item(lines[i]) is not None:
                # Detection uses the raw line, exactly like the handler, so
                # indentation is preserved and both always agree.
                self.end_block()
                i += max(self.handle_list(lines, i), 1)
            elif re.match(tags["tr"], line):
                # max() guarantees progress even if the handler consumes nothing.
                i += max(self.handle_table(lines, i), 1)
            elif re.match(tags["title"], line):
                self.handle_title(line)
                i += 1
            else:
                # Regular text gets buffered for paragraph handling
                self.handle_text(line)
                i += 1

        # End any remaining block
        self.end_block()

        head = self.head or self.create_element("head")
        body = self.create_element("body", children=self.dom)

        return self.create_element("html", children=[head, body])
class CustomRst(t.Generic[T], ABC):
    """
    Abstract base for per-symbol reStructuredText renderers.

    Subclasses must implement ``to_rst``; ``prepare`` and ``verify`` are
    optional hooks whose base implementations do nothing.
    """

    prop = ""
    # NOTE: class-level dict, shared by every subclass that does not override it.
    rst: 'dict[str, str]' = {}

    @abstractmethod
    def to_rst(self, inner: 'list[Symbol]', symbol: 'T', parent: 'Symbol') -> 'str':
        """
        Generate the reStructuredText representation of a symbol.

        Args:
            inner: Symbols nested inside ``symbol``.
            symbol: The symbol being rendered.
            parent: The symbol that contains ``symbol``.

        Returns:
            The reStructuredText for ``symbol``.
        """
        ...

    def prepare(self, inner: 'list[Symbol]', symbol: 'T', parent: 'Symbol'):
        """
        Hook for preparing symbols before rendering; override as needed.

        The base implementation does nothing and returns None.

        Args:
            inner: Symbols nested inside ``symbol``.
            symbol: The symbol about to be rendered.
            parent: The symbol that contains ``symbol``.
        """
        ...

    def verify(self, text) -> 'bool':
        """
        Hook for validating text; override as needed.

        The base implementation performs no check and returns None, so
        subclasses must override it to honour the ``bool`` annotation.

        Args:
            text: The text to validate.
        """
        ...